diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2cea2e504ab..27e1999cb75 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -167,6 +167,7 @@ set(CUGRAPH_SOURCES
     src/detail/groupby_and_count_mg_v64_e64.cu
     src/detail/collect_comm_wrapper_mg_v32_e32.cu
     src/detail/collect_comm_wrapper_mg_v64_e64.cu
+    src/sampling/detail/conversion_utilities.cu
     src/sampling/random_walks_mg_v64_e64.cu
     src/sampling/random_walks_mg_v32_e32.cu
     src/community/detail/common_methods_mg_v64_e64.cu
@@ -264,10 +265,10 @@ set(CUGRAPH_SOURCES
     src/sampling/detail/sample_edges_mg_v32_e32.cu
     src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
     src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
-    src/sampling/neighbor_sampling_mg_v32_e32.cpp
-    src/sampling/neighbor_sampling_mg_v64_e64.cpp
-    src/sampling/neighbor_sampling_sg_v32_e32.cpp
-    src/sampling/neighbor_sampling_sg_v64_e64.cpp
+    src/sampling/neighbor_sampling_mg_v32_e32.cu
+    src/sampling/neighbor_sampling_mg_v64_e64.cu
+    src/sampling/neighbor_sampling_sg_v32_e32.cu
+    src/sampling/neighbor_sampling_sg_v64_e64.cu
     src/sampling/negative_sampling_sg_v32_e32.cu
     src/sampling/negative_sampling_sg_v64_e64.cu
     src/sampling/negative_sampling_mg_v32_e32.cu
diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp b/cpp/include/cugraph/detail/utility_wrappers.hpp
index 3d99b85556b..b1afeafd66b 100644
--- a/cpp/include/cugraph/detail/utility_wrappers.hpp
+++ b/cpp/include/cugraph/detail/utility_wrappers.hpp
@@ -65,6 +65,48 @@ void uniform_random_fill(rmm::cuda_stream_view const& stream_view,
 template <typename value_t>
 void scalar_fill(raft::handle_t const& handle, value_t* d_value, size_t size, value_t value);
 
+/**
+ * @brief    Sort the elements of a device span in place
+ *
+ * @tparam      value_t      type of the value to operate on. Must be either int32_t or int64_t.
+ *
+ * @param [in]  handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[in,out] values    device span to sort; its contents are reordered
+ *
+ */
+template <typename value_t>
+void sort_ints(raft::handle_t const& handle, raft::device_span<value_t> values);
+
+/**
+ * @brief    Keep only the unique elements of a device span
+ *
+ * @tparam      value_t      type of the value to operate on. Must be either int32_t or int64_t.
+ *
+ * @param [in]  handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[in,out] values    device span to reduce to its unique elements.
+ * @return the number of unique elements.
+ *
+ */
+template <typename value_t>
+size_t unique_ints(raft::handle_t const& handle, raft::device_span<value_t> values);
+
+/**
+ * @brief    Increment the values of a device span by a constant value
+ *
+ * @tparam      value_t      type of the value to operate on. Must be either int32_t or int64_t.
+ *
+ * @param[in,out] values     device span to update
+ * @param[in]   value        value to be added to each element of the span
+ * @param[in]   stream_view  stream view
+ *
+ */
+template <typename value_t>
+void transform_increment_ints(raft::device_span<value_t> values,
+                              value_t value,
+                              rmm::cuda_stream_view const& stream_view);
+
 /**
  * @brief    Fill a buffer with a sequence of values
  *
@@ -73,7 +115,7 @@ void scalar_fill(raft::handle_t const& handle, value_t* d_value, size_t size, va
  *
  * Similar to the function std::iota, wraps the function thrust::sequence
  *
- * @tparam      value_t      type of the value to operate on
+ * @tparam      value_t      type of the value to operate on.
  *
  * @param[in]   stream_view  stream view
  * @param[out]  d_value      device array to fill
diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp
index 783cd3a7e2b..3d41e954416 100644
--- a/cpp/include/cugraph/sampling_functions.hpp
+++ b/cpp/include/cugraph/sampling_functions.hpp
@@ -43,6 +43,8 @@ enum class prior_sources_behavior_t { DEFAULT = 0, CARRY_OVER, EXCLUDE };
 /**
  * @brief Uniform Neighborhood Sampling.
  *
+ * @deprecated Replaced with homogeneous_uniform_neighbor_sample
+ *
  * This function traverses from a set of starting vertices, traversing outgoing edges and
  * randomly selects from these outgoing neighbors to extract a subgraph.
  *
@@ -53,19 +55,20 @@ enum class prior_sources_behavior_t { DEFAULT = 0, CARRY_OVER, EXCLUDE };
  * encountered in.  The label output (optional) identifes the vertex label.  The offsets array
  * (optional) will be described below and is dependent upon the input parameters.
  *
- * If @p starting_vertex_labels is not specified then no organization is applied to the output, the
- * label and offsets values in the return set will be std::nullopt.
+ * If @p starting_vertex_label_offsets is not specified then no organization is applied to the
+ * output, the label and offsets values in the return set will be std::nullopt.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is not specified then
- * the label output has values.  This will also result in the output being sorted by vertex label.
- * The offsets array in the return will be a CSR-style offsets array to identify the beginning of
- * each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is not
+ * specified then the label output has values.  This will also result in the output being sorted by
+ * vertex label. The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is specified then the
- * label output has values.  This will also result in the output being sorted by vertex label.  The
- * offsets array in the return will be a CSR-style offsets array to identify the beginning of each
- * label range in the data.  `labels.size() == (offsets.size() - 1)`.  Additionally, the data will
- * be shuffled so that all data with a particular label will be on the specified rank.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is specified
+ * then the label output has values.  This will also result in the output being sorted by vertex
+ * label.  The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * Additionally, the data will be shuffled so that all data with a particular label will be on the
+ * specified rank.
  *
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
@@ -83,8 +86,8 @@ enum class prior_sources_behavior_t { DEFAULT = 0, CARRY_OVER, EXCLUDE };
  * @param edge_type_view Optional view object holding edge types for @p graph_view.
  * @param starting_vertices Device span of starting vertex IDs for the sampling.
  * In a multi-gpu context the starting vertices should be local to this GPU.
- * @param starting_vertex_labels Optional device span of labels associted with each starting vertex
- * for the sampling.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
  * @param label_to_output_comm_rank Optional tuple of device spans mapping label to a particular
  * output rank.  Element 0 of the tuple identifes the label, Element 1 of the tuple identifies the
  * output rank.  The label span must be sorted in ascending order.
@@ -126,7 +129,7 @@ uniform_neighbor_sample(
   std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
   std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
   raft::device_span<vertex_t const> starting_vertices,
-  std::optional<raft::device_span<label_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<label_t const>> starting_vertex_label_offsets,
   std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
     label_to_output_comm_rank,
   raft::host_span<int32_t const> fan_out,
@@ -140,6 +143,8 @@ uniform_neighbor_sample(
 /**
  * @brief Biased Neighborhood Sampling.
  *
+ * @deprecated Replaced with homogeneous_biased_neighbor_sample
+ *
  * This function traverses from a set of starting vertices, traversing outgoing edges and
  * randomly selects (with edge biases) from these outgoing neighbors to extract a subgraph.
  *
@@ -150,24 +155,26 @@ uniform_neighbor_sample(
  * encountered in.  The label output (optional) identifes the vertex label.  The offsets array
  * (optional) will be described below and is dependent upon the input parameters.
  *
- * If @p starting_vertex_labels is not specified then no organization is applied to the output, the
- * label and offsets values in the return set will be std::nullopt.
+ * If @p starting_vertex_label_offsets is not specified then no organization is applied to the
+ * output, the label and offsets values in the return set will be std::nullopt.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is not specified then
- * the label output has values.  This will also result in the output being sorted by vertex label.
- * The offsets array in the return will be a CSR-style offsets array to identify the beginning of
- * each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is not
+ * specified then the label output has values.  This will also result in the output being sorted by
+ * vertex label. The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
  *
- * If @p starting_vertex_labels is specified and @p label_to_output_comm_rank is specified then the
- * label output has values.  This will also result in the output being sorted by vertex label.  The
- * offsets array in the return will be a CSR-style offsets array to identify the beginning of each
- * label range in the data.  `labels.size() == (offsets.size() - 1)`.  Additionally, the data will
- * be shuffled so that all data with a particular label will be on the specified rank.
+ * If @p starting_vertex_label_offsets is specified and @p label_to_output_comm_rank is specified
+ * then the label output has values.  This will also result in the output being sorted by vertex
+ * label.  The offsets array in the return will be a CSR-style offsets array to identify the
+ * beginning of each label range in the data.  `labels.size() == (offsets.size() - 1)`.
+ * Additionally, the data will be shuffled so that all data with a particular label will be on the
+ * specified rank.
  *
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
  * @tparam weight_t Type of edge weights. Needs to be a floating point type.
  * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam bias_t Type of bias. Needs to be an integral type.
  * @tparam label_t Type of label. Needs to be an integral type.
  * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
  * true) are major indices
@@ -184,8 +191,8 @@ uniform_neighbor_sample(
  * corresponding edge can never be selected.
  * @param starting_vertices Device span of starting vertex IDs for the sampling.
  * In a multi-gpu context the starting vertices should be local to this GPU.
- * @param starting_vertex_labels Optional device span of labels associted with each starting vertex
- * for the sampling.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
  * @param label_to_output_comm_rank Optional tuple of device spans mapping label to a particular
  * output rank.  Element 0 of the tuple identifes the label, Element 1 of the tuple identifies the
  * output rank.  The label span must be sorted in ascending order.
@@ -229,7 +236,7 @@ biased_neighbor_sample(
   std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
   edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
   raft::device_span<vertex_t const> starting_vertices,
-  std::optional<raft::device_span<label_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<label_t const>> starting_vertex_label_offsets,
   std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
     label_to_output_comm_rank,
   raft::host_span<int32_t const> fan_out,
@@ -240,6 +247,349 @@ biased_neighbor_sample(
   bool dedupe_sources                             = false,
   bool do_expensive_check                         = false);
 
+struct sampling_flags_t {  // options for the homogeneous and heterogeneous neighbor samplers below
+  /**
+   * Specifies how to handle prior sources. Default is prior_sources_behavior_t::DEFAULT.
+   */
+  prior_sources_behavior_t prior_sources_behavior{};
+
+  /**
+   * Specifies if the hop information should be returned.  Default is false.
+   */
+  bool return_hops{false};
+
+  /**
+   * If true then if a vertex v appears as a destination in hop X multiple times
+   * with the same label, it will only be passed once (for each label) as a source
+   * for the next hop.  Default is false.
+   */
+  bool dedupe_sources{false};
+
+  /**
+   * Specifies if random sampling is done with replacement
+   *   (true) or without replacement (false).  Default is true.
+   */
+  bool with_replacement{true};
+};
+
+/**
+ * @brief Homogeneous Uniform Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (uniformly) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with homogeneous fanouts.
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag selecting the single-GPU (false) or multi-GPU (true) instantiation.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level.
+ * @param sampling_flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional size_t offsets)
+ */
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Homogeneous Biased Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (with edge biases) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with homogeneous fanouts.
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam bias_t Type of bias. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag selecting the single-GPU (false) or multi-GPU (true) instantiation.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param edge_bias_view View object holding edge biases (to be used in biased sampling) for @p
+ * graph_view. Bias values should be non-negative and the sum of edge bias values from any vertex
+ * should not exceed std::numeric_limits<bias_t>::max(). 0 bias value indicates that the
+ * corresponding edge can never be selected.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level.
+ * @param sampling_flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional size_t offsets)
+ */
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Heterogeneous Uniform Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (uniformly) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with heterogeneous fanouts
+ * where the number of edge types is bigger than 1.
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag selecting the single-GPU (false) or multi-GPU (true) instantiation.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level. The fanout value at hop x is given by the expression 'fan_out[x*num_edge_types +
+ * edge_type_id]'
+ * @param num_edge_types Number of edge types where a value of 1 translates to homogeneous neighbor
+ * sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param sampling_flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional size_t offsets)
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Heterogeneous Biased Neighborhood Sampling.
+ *
+ * This function traverses from a set of starting vertices, traversing outgoing edges and
+ * randomly selects (with edge biases) from these outgoing neighbors to extract a subgraph.
+ * The branching out to select outgoing neighbors is performed with heterogeneous fanouts
+ * where the number of edge types is bigger than 1.
+ *
+ * Output from this function is a tuple of vectors (src, dst, weight, edge_id, edge_type, hop,
+ * offsets), identifying the randomly selected edges where the size of src, dst, weight, edge_id,
+ * edge_type and hop is the number of sampled edges while the size of the offsets vector is the
+ * number of labels + 1.  src is the source vertex, dst is the destination vertex, weight
+ * (optional) is the edge weight, edge_id (optional) identifies the edge id, edge_type (optional)
+ * identifies the edge type, hop identifies which hop the edge was encountered in.
+ * The offsets array (optional) identifies the offset for each label.
+ *
+ * If @p label_to_output_comm_rank is specified then the data will be shuffled so that all entries
+ * for a particular label are returned on the specified rank.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type.
+ * @tparam bias_t Type of bias. Needs to be an integral type.
+ * @tparam store_transposed Flag indicating whether sources (if false) or destinations (if
+ * true) are major indices
+ * @tparam multi_gpu Flag selecting the single-GPU (false) or multi-GPU (true) instantiation.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state A pre-initialized raft::RngState object for generating random numbers
+ * @param graph_view Graph View object to generate NBR Sampling on.
+ * @param edge_weight_view Optional view object holding edge weights for @p graph_view.
+ * @param edge_id_view Optional view object holding edge ids for @p graph_view.
+ * @param edge_type_view Optional view object holding edge types for @p graph_view.
+ * @param edge_bias_view View object holding edge biases (to be used in biased sampling) for @p
+ * graph_view. Bias values should be non-negative and the sum of edge bias values from any vertex
+ * should not exceed std::numeric_limits<bias_t>::max(). 0 bias value indicates that the
+ * corresponding edge can never be selected.
+ * @param starting_vertices Device span of starting vertex IDs for the sampling.
+ * In a multi-gpu context the starting vertices should be local to this GPU.
+ * @param starting_vertex_label_offsets Optional device span of labels associated with each starting
+ * vertex for the sampling.
+ * @param label_to_output_comm_rank Optional device span identifying which rank should get sampling
+ * outputs of each vertex label.  This should be the same on each rank.
+ * @param fan_out Host span defining branching out (fan-out) degree per source vertex for each
+ * level. The fanout value at hop x is given by the expression 'fan_out[x*num_edge_types +
+ * edge_type_id]'
+ * @param num_edge_types Number of edge types where a value of 1 translates to homogeneous neighbor
+ * sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param sampling_flags A set of flags indicating which sampling features should be used.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return tuple device vectors (vertex_t source_vertex, vertex_t destination_vertex,
+ * optional weight_t weight, optional edge_t edge id, optional edge_type_t edge type,
+ * optional int32_t hop, optional size_t offsets)
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_label_offsets,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check = false);
+
 /*
  * @brief renumber sampled edge list and compress to the (D)CSR|(D)CSC format.
  *
diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h
index bb26e577915..ef75e726d80 100644
--- a/cpp/include/cugraph_c/sampling_algorithms.h
+++ b/cpp/include/cugraph_c/sampling_algorithms.h
@@ -199,6 +199,13 @@ typedef struct {
   int32_t align_;
 } cugraph_sampling_options_t;
 
+/**
+ * @brief     Opaque sampling flags type
+ */
+typedef struct {
+  int32_t align_;
+} sampling_flags_t;
+
 /**
  * @brief     Enumeration for prior sources behavior
  */
@@ -323,6 +330,8 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options);
 /**
  * @brief     Uniform Neighborhood Sampling
  *
+ * @deprecated  This API will be deleted, use cugraph_homogeneous_uniform_neighbor_sample
+ *
  * Returns a sample of the neighborhood around specified start vertices.  Optionally, each
  * start vertex can be associated with a label, allowing the caller to specify multiple batches
  * of sampling requests in the same function call - which should improve GPU utilization.
@@ -348,8 +357,8 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options);
  * label_to_comm_rank[i].  If not specified then the output data will not be shuffled between ranks.
  * @param [in]  label_offsets Device array of the offsets for each label in the seed list.  This
  *                            parameter is only used with the retain_seeds option.
- * @param [in]  fanout       Host array defining the fan out at each step in the sampling algorithm.
- *                           We only support fanout values of type INT32
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
  * @param [in,out] rng_state State of the random number generator, updated with each call
  * @param [in]  sampling_options
  *                           Opaque pointer defining the sampling options.
@@ -378,6 +387,8 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
 /**
  * @brief     Biased Neighborhood Sampling
  *
+ * @deprecated  This API will be deleted, use cugraph_homogeneous_biased_neighbor_sample.
+ *
  * Returns a sample of the neighborhood around specified start vertices.  Optionally, each
  * start vertex can be associated with a label, allowing the caller to specify multiple batches
  * of sampling requests in the same function call - which should improve GPU utilization.
@@ -406,8 +417,8 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
  * label_to_comm_rank[i].  If not specified then the output data will not be shuffled between ranks.
  * @param [in]  label_offsets Device array of the offsets for each label in the seed list.  This
  *                            parameter is only used with the retain_seeds option.
- * @param [in]  fanout       Host array defining the fan out at each step in the sampling algorithm.
- *                           We only support fanout values of type INT32
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
  * @param [in,out] rng_state State of the random number generator, updated with each call
  * @param [in]  sampling_options
  *                           Opaque pointer defining the sampling options.
@@ -434,6 +445,186 @@ cugraph_error_code_t cugraph_biased_neighbor_sample(
   cugraph_sample_result_t** result,
   cugraph_error_t** error);
 
+/**
+ * @brief     Homogeneous Uniform Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled uniformly.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If starting_vertex_label_offsets is NULL then all start vertices will be considered part of
+ * the same batch and the return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the uniform_neighbor_sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_homogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief     Homogeneous Biased Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled using the edge biases.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If starting_vertex_label_offsets is NULL then all start vertices will be considered part of
+ * the same batch and the return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  edge_biases  Edge property view of the biases to use for sampling.  If NULL
+ * the edge weights are used as the biases.
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the biased neighbor sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_homogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief     Heterogeneous Uniform Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled uniformly.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If starting_vertex_label_offsets is NULL then all start vertices will be considered part of
+ * the same batch and the return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  num_edge_types Number of edge types where a value of 1 translates to homogeneous
+ * neighbor sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param [in]  options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the uniform_neighbor_sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_heterogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
+/**
+ * @brief     Heterogeneous Biased Neighborhood Sampling
+ *
+ * Returns a sample of the neighborhood around specified start vertices and fan_out.
+ * The neighborhood is sampled using the edge biases.
+ * Optionally, each start vertex can be associated with a label, allowing the caller to specify
+ * multiple batches of sampling requests in the same function call - which should improve GPU
+ * utilization.
+ *
+ * If starting_vertex_label_offsets is NULL then all start vertices will be considered part of
+ * the same batch and the return value will not have a label column.
+ *
+ * @param [in]  handle       Handle for accessing resources
+ * @param [in,out] rng_state State of the random number generator, updated with each call
+ * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
+ *                           needs to be transposed
+ * @param [in]  edge_biases  Edge property view of the biases to use for sampling.  If NULL
+ * the edge weights are used as the biases.
+ * @param [in]  start_vertices Device array of start vertices for the sampling
+ * @param [in]  starting_vertex_label_offsets Device array of the offsets for each label in
+ * the seed list. This parameter is only used with the retain_seeds option.
+ * @param [in]  fan_out       Host array defining the fan out at each step in the sampling
+ * algorithm. We only support fan_out values of type INT32
+ * @param [in]  num_edge_types Number of edge types where a value of 1 translates to homogeneous
+ * neighbor sample whereas a value greater than 1 translates to heterogeneous neighbor sample.
+ * @param [in]  options
+ *                           Opaque pointer defining the sampling options.
+ * @param [in]  do_expensive_check
+ *                           A flag to run expensive checks for input arguments (if set to true)
+ * @param [out]  result      Output from the biased neighbor sample call
+ * @param [out] error        Pointer to an error object storing details of any error.  Will
+ *                           be populated if error code is not CUGRAPH_SUCCESS
+ * @return error code
+ */
+cugraph_error_code_t cugraph_heterogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error);
+
 /**
  * @deprecated This call should be replaced with cugraph_sample_result_get_majors
  * @brief     Get the source vertices from the sampling algorithm result
@@ -584,6 +775,26 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map(
 cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map_offsets(
   const cugraph_sample_result_t* result);
 
+/**
+ * @ingroup samplingC
+ * @brief     Get the edge renumber map
+ *
+ * @param [in]   result   The result from a sampling algorithm
+ * @return type erased array pointing to the edge renumber map
+ */
+cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_renumber_map(
+  const cugraph_sample_result_t* result);
+
+/**
+ * @ingroup samplingC
+ * @brief     Get the edge renumber map offsets
+ *
+ * @param [in]   result   The result from a sampling algorithm
+ * @return type erased array pointing to the edge renumber map offsets
+ */
+cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_renumber_map_offsets(
+  const cugraph_sample_result_t* result);
+
 /**
  * @ingroup samplingC
  * @brief     Free a sampling result
diff --git a/cpp/src/c_api/array.hpp b/cpp/src/c_api/array.hpp
index 048d2ee1cea..0ab30a1cb72 100644
--- a/cpp/src/c_api/array.hpp
+++ b/cpp/src/c_api/array.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -125,6 +125,27 @@ struct cugraph_type_erased_host_array_t {
     std::copy(vec.begin(), vec.end(), reinterpret_cast<T*>(data_.get()));
   }
 
+  cugraph_type_erased_host_array_t(cugraph_type_erased_host_array_view_t const* view_p)
+    : data_(std::make_unique<std::byte[]>(view_p->num_bytes_)),
+      size_(view_p->size_),
+      num_bytes_(view_p->num_bytes_),
+      type_(view_p->type_)
+  {
+    std::copy(view_p->data_, view_p->data_ + num_bytes_, data_.get());
+  }
+
+  template <typename T>
+  T* as_type()
+  {
+    return reinterpret_cast<T*>(data_.get());
+  }
+
+  template <typename T>
+  T const* as_type() const
+  {
+    return reinterpret_cast<T const*>(data_.get());
+  }
+
   auto view()
   {
     return new cugraph_type_erased_host_array_view_t{data_.get(), size_, num_bytes_, type_};
diff --git a/cpp/src/c_api/graph_functions.cpp b/cpp/src/c_api/graph_functions.cpp
index df741a349d2..8778369dbe6 100644
--- a/cpp/src/c_api/graph_functions.cpp
+++ b/cpp/src/c_api/graph_functions.cpp
@@ -84,7 +84,7 @@ struct create_vertex_pairs_functor : public cugraph::c_api::abstract_functor {
                               std::nullopt,
                               std::nullopt);
       }
-
+      // FIXME: use std::tuple (template) instead.
       result_ = new cugraph::c_api::cugraph_vertex_pairs_t{
         new cugraph::c_api::cugraph_type_erased_device_array_t(first_copy, graph_->vertex_type_),
         new cugraph::c_api::cugraph_type_erased_device_array_t(second_copy, graph_->vertex_type_)};
diff --git a/cpp/src/c_api/neighbor_sampling.cpp b/cpp/src/c_api/neighbor_sampling.cpp
index 69306806030..be3a44d813a 100644
--- a/cpp/src/c_api/neighbor_sampling.cpp
+++ b/cpp/src/c_api/neighbor_sampling.cpp
@@ -16,12 +16,15 @@
 
 #include "c_api/abstract_functor.hpp"
 #include "c_api/graph.hpp"
+#include "c_api/graph_helper.hpp"
 #include "c_api/properties.hpp"
 #include "c_api/random.hpp"
 #include "c_api/resource_handle.hpp"
 #include "c_api/utils.hpp"
+#include "sampling/detail/sampling_utils.hpp"
 
 #include <cugraph_c/algorithms.h>
+#include <cugraph_c/sampling_algorithms.h>
 
 #include <cugraph/algorithms.hpp>
 #include <cugraph/detail/shuffle_wrappers.hpp>
@@ -44,6 +47,13 @@ struct cugraph_sampling_options_t {
   bool_t retain_seeds_{FALSE};
 };
 
+struct sampling_flags_t {
+  prior_sources_behavior_t prior_sources_behavior_{prior_sources_behavior_t::DEFAULT};
+  bool_t return_hops_{FALSE};
+  bool_t dedupe_sources_{FALSE};
+  bool_t with_replacement_{FALSE};
+};
+
 struct cugraph_sample_result_t {
   cugraph_type_erased_device_array_t* major_offsets_{nullptr};
   cugraph_type_erased_device_array_t* majors_{nullptr};
@@ -56,6 +66,8 @@ struct cugraph_sample_result_t {
   cugraph_type_erased_device_array_t* label_{nullptr};
   cugraph_type_erased_device_array_t* renumber_map_{nullptr};
   cugraph_type_erased_device_array_t* renumber_map_offsets_{nullptr};
+  cugraph_type_erased_device_array_t* edge_renumber_map_{nullptr};
+  cugraph_type_erased_device_array_t* edge_renumber_map_offsets_{nullptr};
 };
 
 }  // namespace c_api
@@ -63,6 +75,7 @@ struct cugraph_sample_result_t {
 
 namespace {
 
+// Deprecated functor
 struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_functor {
   raft::handle_t const& handle_;
   cugraph::c_api::cugraph_graph_t* graph_{nullptr};
@@ -398,11 +411,14 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
                        : nullptr,
         (renumber_map_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
                                    renumber_map_offsets.value(), SIZE_T)
-                               : nullptr};
+                               : nullptr,
+        nullptr,
+        nullptr};
     }
   }
 };
 
+// Deprecated functor
 struct biased_neighbor_sampling_functor : public cugraph::c_api::abstract_functor {
   raft::handle_t const& handle_;
   cugraph::c_api::cugraph_graph_t* graph_{nullptr};
@@ -748,7 +764,598 @@ struct biased_neighbor_sampling_functor : public cugraph::c_api::abstract_functo
                        : nullptr,
         (renumber_map_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
                                    renumber_map_offsets.value(), SIZE_T)
-                               : nullptr};
+                               : nullptr,
+        nullptr,
+        nullptr};
+    }
+  }
+};
+
+struct neighbor_sampling_functor : public cugraph::c_api::abstract_functor {
+  raft::handle_t const& handle_;
+  cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr};
+  cugraph::c_api::cugraph_graph_t* graph_{nullptr};
+  cugraph::c_api::cugraph_edge_property_view_t const* edge_biases_{nullptr};
+  cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr};
+  cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertex_offsets_{nullptr};
+  cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr};
+  int num_edge_types_{};
+  cugraph::c_api::cugraph_sampling_options_t options_{};
+  bool is_biased_{false};
+  bool do_expensive_check_{false};
+  cugraph::c_api::cugraph_sample_result_t* result_{nullptr};
+
+  neighbor_sampling_functor(cugraph_resource_handle_t const* handle,
+                            cugraph_rng_state_t* rng_state,
+                            cugraph_graph_t* graph,
+                            cugraph_edge_property_view_t const* edge_biases,
+                            cugraph_type_erased_device_array_view_t const* start_vertices,
+                            cugraph_type_erased_device_array_view_t const* start_vertex_offsets,
+                            cugraph_type_erased_host_array_view_t const* fan_out,
+                            int num_edge_types,
+                            cugraph::c_api::cugraph_sampling_options_t options,
+                            bool is_biased,
+                            bool do_expensive_check)
+    : abstract_functor(),
+      handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
+      rng_state_(reinterpret_cast<cugraph::c_api::cugraph_rng_state_t*>(rng_state)),
+      graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
+      edge_biases_(
+        reinterpret_cast<cugraph::c_api::cugraph_edge_property_view_t const*>(edge_biases)),
+      start_vertices_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+          start_vertices)),
+      start_vertex_offsets_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
+          start_vertex_offsets)),
+      fan_out_(
+        reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)),
+      num_edge_types_(num_edge_types),
+      options_(options),
+      is_biased_(is_biased),
+      do_expensive_check_(do_expensive_check)
+  {
+  }
+
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename edge_type_t,
+            bool store_transposed,
+            bool multi_gpu>
+  void operator()()
+  {
+    using label_t = int32_t;
+
+    // FIXME: Think about how to handle SG vice MG
+    if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
+      unsupported();
+    } else {
+      // uniform_nbr_sample expects store_transposed == false
+      if constexpr (store_transposed) {
+        error_code_ = cugraph::c_api::
+          transpose_storage<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
+            handle_, graph_, error_.get());
+        if (error_code_ != CUGRAPH_SUCCESS) return;
+      }
+
+      auto graph =
+        reinterpret_cast<cugraph::graph_t<vertex_t, edge_t, false, multi_gpu>*>(graph_->graph_);
+
+      auto graph_view = graph->view();
+
+      auto edge_weights = reinterpret_cast<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, multi_gpu>,
+                                 weight_t>*>(graph_->edge_weights_);
+
+      auto edge_ids = reinterpret_cast<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, multi_gpu>,
+                                 edge_t>*>(graph_->edge_ids_);
+
+      auto edge_types = reinterpret_cast<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, true, multi_gpu>,
+                                 edge_type_t>*>(graph_->edge_types_);
+
+      auto number_map = reinterpret_cast<rmm::device_uvector<vertex_t>*>(graph_->number_map_);
+
+      auto edge_biases =
+        edge_biases_ ? reinterpret_cast<cugraph::edge_property_view_t<edge_t, weight_t const*>*>(
+                         edge_biases_->edge_property_)
+                     : nullptr;
+
+      rmm::device_uvector<vertex_t> start_vertices(start_vertices_->size_, handle_.get_stream());
+      raft::copy(start_vertices.data(),
+                 start_vertices_->as_type<vertex_t>(),
+                 start_vertices.size(),
+                 handle_.get_stream());
+
+      std::optional<rmm::device_uvector<label_t>> start_vertex_labels{std::nullopt};
+      std::optional<rmm::device_uvector<label_t>> local_label_to_comm_rank{std::nullopt};
+      std::optional<rmm::device_uvector<label_t>> label_to_comm_rank{
+        std::nullopt};  // global after allgatherv
+
+      std::optional<rmm::device_uvector<edge_t>> renumbered_and_sorted_edge_id_renumber_map(
+        std::nullopt);
+      std::optional<rmm::device_uvector<size_t>>
+        renumbered_and_sorted_edge_id_renumber_map_label_type_offsets(std::nullopt);
+
+      if (start_vertex_offsets_ != nullptr) {
+        // Retrieve the start_vertex_labels
+        start_vertex_labels = cugraph::detail::convert_starting_vertex_label_offsets_to_labels(
+          handle_,
+          raft::device_span<size_t const>{start_vertex_offsets_->as_type<size_t>(),
+                                          start_vertex_offsets_->size_});
+
+        // Get the number of labels on each GPU
+
+        if constexpr (multi_gpu) {
+          auto num_local_labels = start_vertex_offsets_->size_ - 1;
+
+          auto global_labels = cugraph::host_scalar_allgather(
+            handle_.get_comms(), num_local_labels, handle_.get_stream());
+
+          std::exclusive_scan(
+            global_labels.begin(), global_labels.end(), global_labels.begin(), label_t{0});
+
+          // Compute the global start_vertex_label_offsets
+
+          cugraph::detail::transform_increment_ints(
+            raft::device_span<label_t>{(*start_vertex_labels).data(),
+                                       (*start_vertex_labels).size()},
+            (label_t)global_labels[handle_.get_comms().get_rank()],
+            handle_.get_stream());
+
+          rmm::device_uvector<label_t> unique_labels((*start_vertex_labels).size(),
+                                                     handle_.get_stream());
+          raft::copy(unique_labels.data(),
+                     (*start_vertex_labels).data(),
+                     unique_labels.size(),
+                     handle_.get_stream());
+
+          // Get unique labels
+          // sort the start_vertex_labels
+          cugraph::detail::sort_ints(
+            handle_.get_stream(),
+            raft::device_span<label_t>{unique_labels.data(), unique_labels.size()});
+
+          auto num_unique_labels = cugraph::detail::unique_ints(
+            handle_.get_stream(),
+            raft::device_span<label_t>{unique_labels.data(), unique_labels.size()});
+
+          (*local_label_to_comm_rank).resize(num_unique_labels, handle_.get_stream());
+
+          cugraph::detail::scalar_fill(
+            handle_.get_stream(),
+            (*local_label_to_comm_rank).begin(),  // This should be renamed to rank
+            (*local_label_to_comm_rank).size(),
+            label_t{handle_.get_comms().get_rank()});
+
+          // Perform allgather to get global_label_to_comm_rank_d_vector
+          auto recvcounts = cugraph::host_scalar_allgather(
+            handle_.get_comms(), num_unique_labels, handle_.get_stream());
+
+          std::vector<size_t> displacements(recvcounts.size());
+          std::exclusive_scan(
+            recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0});
+
+          (*label_to_comm_rank)
+            .resize(displacements.back() + recvcounts.back(), handle_.get_stream());
+
+          cugraph::device_allgatherv(handle_.get_comms(),
+                                     (*local_label_to_comm_rank).begin(),
+                                     (*label_to_comm_rank).begin(),
+                                     recvcounts,
+                                     displacements,
+                                     handle_.get_stream());
+
+          std::tie(start_vertices, *start_vertex_labels) =
+            cugraph::detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+              handle_, std::move(start_vertices), std::move(*start_vertex_labels));
+        }
+      } else {
+        if constexpr (multi_gpu) {
+          start_vertices =
+            cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+              handle_, std::move(start_vertices));
+        }
+      }
+      //
+      // Need to renumber start_vertices
+      //
+      cugraph::renumber_local_ext_vertices<vertex_t, multi_gpu>(
+        handle_,
+        start_vertices.data(),
+        start_vertices.size(),
+        number_map->data(),
+        graph_view.local_vertex_partition_range_first(),
+        graph_view.local_vertex_partition_range_last(),
+        do_expensive_check_);
+
+      rmm::device_uvector<vertex_t> src(0, handle_.get_stream());
+      rmm::device_uvector<vertex_t> dst(0, handle_.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> edge_id{std::nullopt};
+      std::optional<rmm::device_uvector<edge_type_t>> edge_type{std::nullopt};
+      std::optional<rmm::device_uvector<int32_t>> hop{std::nullopt};
+      std::optional<rmm::device_uvector<label_t>> edge_label{std::nullopt};
+      std::optional<rmm::device_uvector<size_t>> offsets{std::nullopt};
+
+      // FIXME: For biased sampling, the user should pass either biases or edge weights,
+      // otherwise throw an error and suggest the user to call uniform neighbor sample instead
+
+      if (num_edge_types_ > 1) {
+        // call heterogeneous neighbor sample
+        if (is_biased_) {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::heterogeneous_biased_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              (edge_biases != nullptr) ? *edge_biases : edge_weights->view(),
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              num_edge_types_,
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        } else {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::heterogeneous_uniform_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              num_edge_types_,
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        }
+      } else {
+        // Call homogeneous neighbor sample
+        if (is_biased_) {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::homogeneous_biased_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              (edge_biases != nullptr) ? *edge_biases : edge_weights->view(),
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        } else {
+          std::tie(src, dst, wgt, edge_id, edge_type, hop, offsets) =
+            cugraph::homogeneous_uniform_neighbor_sample(
+              handle_,
+              rng_state_->rng_state_,
+              graph_view,
+              (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
+              (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt,
+              (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt,
+              raft::device_span<vertex_t const>{start_vertices.data(), start_vertices.size()},
+              (start_vertex_offsets_ != nullptr)
+                ? std::make_optional<raft::device_span<int const>>((*start_vertex_labels).data(),
+                                                                   (*start_vertex_labels).size())
+                : std::nullopt,
+              label_to_comm_rank ? std::make_optional(raft::device_span<int const>{
+                                     (*label_to_comm_rank).data(), (*label_to_comm_rank).size()})
+                                 : std::nullopt,
+              raft::host_span<const int>(fan_out_->as_type<const int>(), fan_out_->size_),
+              cugraph::sampling_flags_t{options_.prior_sources_behavior_,
+                                        options_.return_hops_,
+                                        options_.dedupe_sources_,
+                                        options_.with_replacement_},
+              do_expensive_check_);
+        }
+      }
+
+      std::vector<vertex_t> vertex_partition_lasts = graph_view.vertex_partition_range_lasts();
+
+      cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
+                                                            src.data(),
+                                                            src.size(),
+                                                            number_map->data(),
+                                                            vertex_partition_lasts,
+                                                            do_expensive_check_);
+
+      cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(handle_,
+                                                            dst.data(),
+                                                            dst.size(),
+                                                            number_map->data(),
+                                                            vertex_partition_lasts,
+                                                            do_expensive_check_);
+
+      std::optional<rmm::device_uvector<vertex_t>> majors{std::nullopt};
+      rmm::device_uvector<vertex_t> minors(0, handle_.get_stream());
+      std::optional<rmm::device_uvector<size_t>> major_offsets{std::nullopt};
+
+      std::optional<rmm::device_uvector<size_t>> label_hop_offsets{std::nullopt};
+
+      std::optional<rmm::device_uvector<vertex_t>> renumber_map{std::nullopt};
+      std::optional<rmm::device_uvector<size_t>> renumber_map_offsets{std::nullopt};
+
+      bool src_is_major = (options_.compression_type_ == cugraph_compression_type_t::CSR) ||
+                          (options_.compression_type_ == cugraph_compression_type_t::DCSR) ||
+                          (options_.compression_type_ == cugraph_compression_type_t::COO);
+
+      // Extract the edge_label from the offsets
+      if (offsets) {
+        edge_label = cugraph::c_api::expand_sparse_offsets(
+          raft::device_span<size_t const>{(*offsets).data(), (*offsets).size()},
+          label_t{0},
+          handle_.get_stream());
+      }
+
+      if (options_.renumber_results_) {
+        if (num_edge_types_ == 1) {  // homogeneous renumbering
+          if (options_.compression_type_ == cugraph_compression_type_t::COO) {
+            // COO
+
+            rmm::device_uvector<vertex_t> output_majors(0, handle_.get_stream());
+            rmm::device_uvector<vertex_t> output_renumber_map(0, handle_.get_stream());
+            std::tie(output_majors,
+                     minors,
+                     wgt,
+                     edge_id,
+                     edge_type,
+                     label_hop_offsets,
+                     output_renumber_map,
+                     renumber_map_offsets) =
+              cugraph::renumber_and_sort_sampled_edgelist<vertex_t>(
+                handle_,
+                std::move(src),
+                std::move(dst),
+                std::move(wgt),
+                std::move(edge_id),
+                std::move(edge_type),
+                std::move(hop),
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<vertex_t const>{
+                      start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
+                  : std::nullopt,
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<size_t const>{
+                      start_vertex_offsets_->as_type<size_t>(), start_vertex_offsets_->size_})
+                  : std::nullopt,
+                offsets ? std::make_optional(
+                            raft::device_span<size_t const>{offsets->data(), offsets->size()})
+                        : std::nullopt,
+                offsets ? (*offsets).size() - 1 : size_t{1},
+                hop ? fan_out_->size_ : size_t{1},
+                src_is_major,
+                do_expensive_check_);
+
+            majors.emplace(std::move(output_majors));
+            renumber_map.emplace(std::move(output_renumber_map));
+          } else {
+            // (D)CSC, (D)CSR
+
+            bool doubly_compress =
+              (options_.compression_type_ == cugraph_compression_type_t::DCSR) ||
+              (options_.compression_type_ == cugraph_compression_type_t::DCSC);
+
+            rmm::device_uvector<size_t> output_major_offsets(0, handle_.get_stream());
+            rmm::device_uvector<vertex_t> output_renumber_map(0, handle_.get_stream());
+
+            std::tie(majors,
+                     output_major_offsets,
+                     minors,
+                     wgt,
+                     edge_id,
+                     edge_type,
+                     label_hop_offsets,
+                     output_renumber_map,
+                     renumber_map_offsets) =
+              cugraph::renumber_and_compress_sampled_edgelist<vertex_t>(
+                handle_,
+                std::move(src),
+                std::move(dst),
+                std::move(wgt),
+                std::move(edge_id),
+                std::move(edge_type),
+                std::move(hop),
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<vertex_t const>{
+                      start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
+                  : std::nullopt,
+                options_.retain_seeds_
+                  ? std::make_optional(raft::device_span<size_t const>{
+                      start_vertex_offsets_->as_type<size_t>(), start_vertex_offsets_->size_})
+                  : std::nullopt,
+                offsets ? std::make_optional(
+                            raft::device_span<size_t const>{offsets->data(), offsets->size()})
+                        : std::nullopt,
+                edge_label ? (*offsets).size() - 1 : size_t{1},  // FIXME: update edge_label
+                hop ? fan_out_->size_ : size_t{1},
+                src_is_major,
+                options_.compress_per_hop_,
+                doubly_compress,
+                do_expensive_check_);
+
+            renumber_map.emplace(std::move(output_renumber_map));
+            major_offsets.emplace(std::move(output_major_offsets));
+          }
+
+          // These are now represented by label_hop_offsets
+          hop.reset();
+          offsets.reset();
+
+        } else {  // heterogeneous renumbering
+
+          rmm::device_uvector<vertex_t> vertex_type_offsets(
+            graph_view.local_vertex_partition_range_size(), handle_.get_stream());
+
+          cugraph::detail::sequence_fill(handle_.get_stream(),
+                                         vertex_type_offsets.begin(),
+                                         vertex_type_offsets.size(),
+                                         vertex_t{0}  // FIXME: Update array
+          );
+
+          rmm::device_uvector<vertex_t> output_majors(0, handle_.get_stream());
+          rmm::device_uvector<vertex_t> output_renumber_map(0, handle_.get_stream());
+
+          // extract the edge_type from label_type_hop_offsets
+          std::optional<rmm::device_uvector<size_t>> label_type_hop_offsets{std::nullopt};
+          std::tie(output_majors,
+                   minors,
+                   wgt,
+                   edge_id,
+                   label_type_hop_offsets,  // Contains information about the type and hop offsets
+                   output_renumber_map,
+                   (*renumber_map_offsets),
+                   renumbered_and_sorted_edge_id_renumber_map,
+                   renumbered_and_sorted_edge_id_renumber_map_label_type_offsets) =
+            cugraph::heterogeneous_renumber_and_sort_sampled_edgelist<vertex_t>(
+              handle_,
+              std::move(src),
+              std::move(dst),
+              std::move(wgt),
+              std::move(edge_id),
+              std::move(edge_type),
+              std::move(hop),
+              options_.retain_seeds_
+                ? std::make_optional(raft::device_span<vertex_t const>{
+                    start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
+                : std::nullopt,
+              options_.retain_seeds_
+                ? std::make_optional(raft::device_span<size_t const>{
+                    start_vertex_offsets_->as_type<size_t>(), start_vertex_offsets_->size_})
+                : std::nullopt,
+              offsets ? std::make_optional(
+                          raft::device_span<size_t const>{offsets->data(), offsets->size()})
+                      : std::nullopt,
+              raft::device_span<vertex_t const>{vertex_type_offsets.data(),
+                                                vertex_type_offsets.size()},
+
+              edge_label ? (*offsets).size() - 1 : size_t{1},
+              hop ? fan_out_->size_ : size_t{1},
+              size_t{1},
+              num_edge_types_,
+              src_is_major,
+              do_expensive_check_);
+          if (edge_type) {
+            (*edge_type)
+              .resize(raft::device_span<size_t const>{(*label_type_hop_offsets).data(),
+                                                      (*label_type_hop_offsets).size()}
+                          .back() -
+                        1,
+                      handle_.get_stream());
+            cugraph::detail::sequence_fill(
+              handle_.get_stream(), (*edge_type).begin(), (*edge_type).size(), edge_type_t{0});
+          }
+
+          majors.emplace(std::move(output_majors));
+          // FIXME: Need to update renumber_map because default values are being passed
+          renumber_map.emplace(std::move(output_renumber_map));
+        }
+
+      } else {
+        if (options_.compression_type_ != cugraph_compression_type_t::COO) {
+          CUGRAPH_FAIL("Can only use COO format if not renumbering");
+        }
+
+        std::tie(src, dst, wgt, edge_id, edge_type, label_hop_offsets) =
+          cugraph::sort_sampled_edgelist(handle_,
+                                         std::move(src),
+                                         std::move(dst),
+                                         std::move(wgt),
+                                         std::move(edge_id),
+                                         std::move(edge_type),
+                                         std::move(hop),
+                                         offsets
+                                           ? std::make_optional(raft::device_span<size_t const>{
+                                               offsets->data(), offsets->size()})
+                                           : std::nullopt,
+                                         // derive label size from offset size instead of performing
+                                         // thrust::unique on edge_label.
+                                         edge_label ? (*offsets).size() - 1 : size_t{1},
+                                         hop ? fan_out_->size_ : size_t{1},
+                                         src_is_major,
+                                         do_expensive_check_);
+
+        majors.emplace(std::move(src));
+        minors = std::move(dst);
+
+        hop.reset();
+        offsets.reset();
+      }
+
+      result_ = new cugraph::c_api::cugraph_sample_result_t{
+        (major_offsets)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*major_offsets, SIZE_T)
+          : nullptr,
+        (majors)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*majors, graph_->vertex_type_)
+          : nullptr,
+        new cugraph::c_api::cugraph_type_erased_device_array_t(minors, graph_->vertex_type_),
+        (edge_id)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*edge_id, graph_->edge_type_)
+          : nullptr,
+        (edge_type) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+                        *edge_type, graph_->edge_type_id_type_)
+                    : nullptr,
+        (wgt) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*wgt, graph_->weight_type_)
+              : nullptr,
+        (hop) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*hop, INT32)
+              : nullptr,  // FIXME get rid of this
+        (label_hop_offsets)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(*label_hop_offsets, SIZE_T)
+          : nullptr,
+        (edge_label)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(edge_label.value(), INT32)
+          : nullptr,
+        (renumber_map) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+                           renumber_map.value(), graph_->vertex_type_)
+                       : nullptr,
+        (renumber_map_offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+                                   renumber_map_offsets.value(), SIZE_T)
+                               : nullptr,
+        (renumbered_and_sorted_edge_id_renumber_map)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+              renumbered_and_sorted_edge_id_renumber_map.value(), graph_->edge_type_)
+          : nullptr,
+        (renumbered_and_sorted_edge_id_renumber_map_label_type_offsets)
+          ? new cugraph::c_api::cugraph_type_erased_device_array_t(
+              renumbered_and_sorted_edge_id_renumber_map_label_type_offsets.value(), SIZE_T)
+          : nullptr};
     }
   }
 };
@@ -985,6 +1592,26 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_re
                internal_pointer->renumber_map_offsets_->view());
 }
 
+// Returns a view of the edge renumber map, or NULL if absent (guards the pointer it derefs).
+extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_renumber_map(
+  const cugraph_sample_result_t* result)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_sample_result_t const*>(result);
+  if (internal_pointer->edge_renumber_map_ == nullptr) { return NULL; }
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+    internal_pointer->edge_renumber_map_->view());
+}
+
+// Returns a view of the edge renumber map offsets, or NULL if absent (guards what it derefs).
+extern "C" cugraph_type_erased_device_array_view_t*
+cugraph_sample_result_get_edge_renumber_map_offsets(const cugraph_sample_result_t* result)
+{
+  auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_sample_result_t const*>(result);
+  if (internal_pointer->edge_renumber_map_offsets_ == nullptr) { return NULL; }
+  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+    internal_pointer->edge_renumber_map_offsets_->view());
+}
+
 extern "C" cugraph_error_code_t cugraph_test_uniform_neighborhood_sample_result_create(
   const cugraph_resource_handle_t* handle,
   const cugraph_type_erased_device_array_view_t* srcs,
@@ -1292,6 +1919,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
     "fan_out should be of type int",
     *error);
 
+  //  Deprecated functor
   uniform_neighbor_sampling_functor functor{handle,
                                             graph,
                                             start_vertices,
@@ -1369,6 +1997,7 @@ cugraph_error_code_t cugraph_biased_neighbor_sample(
     "fan_out should be of type int",
     *error);
 
+  // Deprecated functor
   biased_neighbor_sampling_functor functor{handle,
                                            graph,
                                            edge_biases,
@@ -1383,3 +2012,249 @@ cugraph_error_code_t cugraph_biased_neighbor_sample(
                                            do_expensive_check};
   return cugraph::c_api::run_algorithm(graph, functor, result, error);
 }
+
+// Validates the C-level arguments, then runs neighbor_sampling_functor without edge biases.
+cugraph_error_code_t cugraph_heterogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  using device_view_t = cugraph::c_api::cugraph_type_erased_device_array_view_t;
+  using host_view_t   = cugraph::c_api::cugraph_type_erased_host_array_view_t;
+  // Private copy of the caller's options; it is moved into the functor at the end.
+  auto sampling_options =
+    *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  // Typed views of the opaque C handles; only the checks below dereference them.
+  auto* graph_impl         = reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph);
+  auto const* seeds        = reinterpret_cast<device_view_t const*>(start_vertices);
+  auto const* seed_offsets = reinterpret_cast<device_view_t const*>(start_vertex_offsets);
+  auto const* fan_out_view = reinterpret_cast<host_view_t const*>(fan_out);
+
+  // FIXME: Should we maintain this condition?
+  CAPI_EXPECTS((!sampling_options.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) || (seed_offsets->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+  CAPI_EXPECTS(fan_out_view->type_ == INT32,
+               CUGRAPH_INVALID_INPUT,
+               "fan_out should be of type int",
+               *error);
+  CAPI_EXPECTS(graph_impl->vertex_type_ == seeds->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    nullptr,  // slot where the biased variants pass edge_biases
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    num_edge_types,
+                                    std::move(sampling_options),
+                                    FALSE,  // the biased variants pass TRUE here
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+// Validates the C-level arguments, then runs neighbor_sampling_functor in biased mode.
+cugraph_error_code_t cugraph_heterogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  int num_edge_types,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  using device_view_t = cugraph::c_api::cugraph_type_erased_device_array_view_t;
+  using host_view_t   = cugraph::c_api::cugraph_type_erased_host_array_view_t;
+  // Private copy of the caller's options; it is moved into the functor at the end.
+  auto sampling_options =
+    *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  // Typed views of the opaque C handles; only the checks below dereference them.
+  auto* graph_impl         = reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph);
+  auto const* seeds        = reinterpret_cast<device_view_t const*>(start_vertices);
+  auto const* seed_offsets = reinterpret_cast<device_view_t const*>(start_vertex_offsets);
+  auto const* fan_out_view = reinterpret_cast<host_view_t const*>(fan_out);
+
+  // A bias source is mandatory: either the caller supplies edge_biases or the graph
+  // itself must carry edge weights.
+  CAPI_EXPECTS((edge_biases != nullptr) || (graph_impl->edge_weights_ != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "edge_biases is required if the graph is not weighted",
+               *error);
+
+  // FIXME: Should we maintain this condition?
+  CAPI_EXPECTS((!sampling_options.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) || (seed_offsets->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+  CAPI_EXPECTS(fan_out_view->type_ == INT32,
+               CUGRAPH_INVALID_INPUT,
+               "fan_out should be of type int",
+               *error);
+  CAPI_EXPECTS(graph_impl->vertex_type_ == seeds->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    edge_biases,
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    num_edge_types,
+                                    std::move(sampling_options),
+                                    TRUE,  // the uniform variants pass FALSE here
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+// Validates the C-level arguments, then runs neighbor_sampling_functor with one edge type.
+cugraph_error_code_t cugraph_homogeneous_uniform_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,  // RENAME?
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  using device_view_t = cugraph::c_api::cugraph_type_erased_device_array_view_t;
+  using host_view_t   = cugraph::c_api::cugraph_type_erased_host_array_view_t;
+  // Private copy of the caller's options; it is moved into the functor at the end.
+  auto sampling_options =
+    *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  // Typed views of the opaque C handles; only the checks below dereference them.
+  auto* graph_impl         = reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph);
+  auto const* seeds        = reinterpret_cast<device_view_t const*>(start_vertices);
+  auto const* seed_offsets = reinterpret_cast<device_view_t const*>(start_vertex_offsets);
+  auto const* fan_out_view = reinterpret_cast<host_view_t const*>(fan_out);
+
+  // FIXME: Should we maintain this condition?
+  CAPI_EXPECTS((!sampling_options.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) || (seed_offsets->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+  CAPI_EXPECTS(fan_out_view->type_ == INT32,
+               CUGRAPH_INVALID_INPUT,
+               "fan_out type must be INT32",
+               *error);
+  CAPI_EXPECTS(graph_impl->vertex_type_ == seeds->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    nullptr,  // slot where the biased variants pass edge_biases
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    1,  // num_edge_types
+                                    std::move(sampling_options),
+                                    FALSE,  // the biased variants pass TRUE here
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
+
+// Validates the C-level arguments, then runs the biased functor with one edge type.
+cugraph_error_code_t cugraph_homogeneous_biased_neighbor_sample(
+  const cugraph_resource_handle_t* handle,
+  cugraph_rng_state_t* rng_state,
+  cugraph_graph_t* graph,
+  const cugraph_edge_property_view_t* edge_biases,
+  const cugraph_type_erased_device_array_view_t* start_vertices,
+  const cugraph_type_erased_device_array_view_t* start_vertex_offsets,
+  const cugraph_type_erased_host_array_view_t* fan_out,
+  const cugraph_sampling_options_t* options,
+  bool_t do_expensive_check,
+  cugraph_sample_result_t** result,
+  cugraph_error_t** error)
+{
+  using device_view_t = cugraph::c_api::cugraph_type_erased_device_array_view_t;
+  using host_view_t   = cugraph::c_api::cugraph_type_erased_host_array_view_t;
+  // Private copy of the caller's options; it is moved into the functor at the end.
+  auto sampling_options =
+    *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);
+
+  // Typed views of the opaque C handles; only the checks below dereference them.
+  auto* graph_impl         = reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph);
+  auto const* seeds        = reinterpret_cast<device_view_t const*>(start_vertices);
+  auto const* seed_offsets = reinterpret_cast<device_view_t const*>(start_vertex_offsets);
+  auto const* fan_out_view = reinterpret_cast<host_view_t const*>(fan_out);
+
+  // A bias source is mandatory: either the caller supplies edge_biases or the graph
+  // itself must carry edge weights.
+  CAPI_EXPECTS((edge_biases != nullptr) || (graph_impl->edge_weights_ != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "edge_biases is required if the graph is not weighted",
+               *error);
+
+  // FIXME: Should we maintain this condition?
+  CAPI_EXPECTS((!sampling_options.retain_seeds_) || (start_vertex_offsets != nullptr),
+               CUGRAPH_INVALID_INPUT,
+               "must specify start_vertex_offsets if retain_seeds is true",
+               *error);
+  CAPI_EXPECTS((start_vertex_offsets == nullptr) || (seed_offsets->type_ == SIZE_T),
+               CUGRAPH_INVALID_INPUT,
+               "start_vertex_offsets should be of type size_t",
+               *error);
+  CAPI_EXPECTS(fan_out_view->type_ == INT32,
+               CUGRAPH_INVALID_INPUT,
+               "fan_out type must be INT32",
+               *error);
+  CAPI_EXPECTS(graph_impl->vertex_type_ == seeds->type_,
+               CUGRAPH_INVALID_INPUT,
+               "vertex type of graph and start_vertices must match",
+               *error);
+
+  neighbor_sampling_functor functor{handle,
+                                    rng_state,
+                                    graph,
+                                    edge_biases,
+                                    start_vertices,
+                                    start_vertex_offsets,
+                                    fan_out,
+                                    1,  // num_edge_types
+                                    std::move(sampling_options),
+                                    TRUE,  // the uniform variants pass FALSE here
+                                    do_expensive_check};
+  return cugraph::c_api::run_algorithm(graph, functor, result, error);
+}
diff --git a/cpp/src/detail/utility_wrappers_32.cu b/cpp/src/detail/utility_wrappers_32.cu
index de407f12493..879a1adf337 100644
--- a/cpp/src/detail/utility_wrappers_32.cu
+++ b/cpp/src/detail/utility_wrappers_32.cu
@@ -63,6 +63,10 @@ template void scalar_fill(raft::handle_t const& handle, size_t* d_value, size_t
 
 template void scalar_fill(raft::handle_t const& handle, float* d_value, size_t size, float value);
 
+template void sort_ints(raft::handle_t const& handle, raft::device_span<int32_t> values);
+
+template size_t unique_ints(raft::handle_t const& handle, raft::device_span<int32_t> values);
+
 template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             int32_t* d_value,
                             size_t size,
@@ -73,6 +77,10 @@ template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             size_t size,
                             uint32_t start_value);
 
+template void transform_increment_ints(raft::device_span<int32_t> values,
+                                       int32_t value,
+                                       rmm::cuda_stream_view const& stream_view);
+
 template void stride_fill(rmm::cuda_stream_view const& stream_view,
                           int32_t* d_value,
                           size_t size,
diff --git a/cpp/src/detail/utility_wrappers_64.cu b/cpp/src/detail/utility_wrappers_64.cu
index 2c136d5902b..742cb18d718 100644
--- a/cpp/src/detail/utility_wrappers_64.cu
+++ b/cpp/src/detail/utility_wrappers_64.cu
@@ -61,6 +61,10 @@ template void scalar_fill(raft::handle_t const& handle,
 
 template void scalar_fill(raft::handle_t const& handle, double* d_value, size_t size, double value);
 
+template void sort_ints(raft::handle_t const& handle, raft::device_span<int64_t> values);
+
+template size_t unique_ints(raft::handle_t const& handle, raft::device_span<int64_t> values);
+
 template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             int64_t* d_value,
                             size_t size,
@@ -71,6 +75,10 @@ template void sequence_fill(rmm::cuda_stream_view const& stream_view,
                             size_t size,
                             uint64_t start_value);
 
+template void transform_increment_ints(raft::device_span<int64_t> values,
+                                       int64_t value,
+                                       rmm::cuda_stream_view const& stream_view);
+
 template void stride_fill(rmm::cuda_stream_view const& stream_view,
                           int64_t* d_value,
                           size_t size,
diff --git a/cpp/src/detail/utility_wrappers_impl.cuh b/cpp/src/detail/utility_wrappers_impl.cuh
index 074d7044261..93bd14c4d06 100644
--- a/cpp/src/detail/utility_wrappers_impl.cuh
+++ b/cpp/src/detail/utility_wrappers_impl.cuh
@@ -36,6 +36,7 @@
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
+#include <thrust/unique.h>
 
 namespace cugraph {
 namespace detail {
@@ -63,6 +64,20 @@ void scalar_fill(raft::handle_t const& handle, value_t* d_value, size_t size, va
   thrust::fill_n(handle.get_thrust_policy(), d_value, size, value);
 }
 
+template <typename value_t>
+void sort_ints(raft::handle_t const& handle, raft::device_span<value_t> values)
+{
+  thrust::sort(handle.get_thrust_policy(), values.begin(), values.end());  // in place, ascending
+}
+
+template <typename value_t>
+size_t unique_ints(raft::handle_t const& handle, raft::device_span<value_t> values)
+{
+  // Removes consecutive duplicates in place; returns how many leading entries remain valid.
+  auto const new_end = thrust::unique(handle.get_thrust_policy(), values.begin(), values.end());
+  return static_cast<size_t>(thrust::distance(values.begin(), new_end));
+}
+
 template <typename value_t>
 void sequence_fill(rmm::cuda_stream_view const& stream_view,
                    value_t* d_value,
@@ -72,6 +87,20 @@ void sequence_fill(rmm::cuda_stream_view const& stream_view,
   thrust::sequence(rmm::exec_policy(stream_view), d_value, d_value + size, start_value);
 }
 
+template <typename value_t>
+void transform_increment_ints(raft::device_span<value_t> values,
+                              value_t incr,
+                              rmm::cuda_stream_view const& stream_view)
+{
+  // Device functor adding the captured increment; the result is cast back to value_t.
+  auto const add_incr = cuda::proclaim_return_type<value_t>(
+    [incr] __device__(value_t element) { return static_cast<value_t>(element + incr); });
+
+  // Input and output ranges coincide, so `values` is updated in place on `stream_view`.
+  thrust::transform(
+    rmm::exec_policy(stream_view), values.begin(), values.end(), values.begin(), add_incr);
+}
+
 template <typename value_t>
 void stride_fill(rmm::cuda_stream_view const& stream_view,
                  value_t* d_value,
diff --git a/cpp/src/sampling/detail/conversion_utilities.cu b/cpp/src/sampling/detail/conversion_utilities.cu
new file mode 100644
index 00000000000..0279735dc1f
--- /dev/null
+++ b/cpp/src/sampling/detail/conversion_utilities.cu
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sampling/detail/sampling_utils.hpp"
+
+#include <cugraph/utilities/misc_utils.cuh>
+
+namespace cugraph {
+namespace detail {
+
+rmm::device_uvector<int32_t> convert_starting_vertex_label_offsets_to_labels(
+  raft::handle_t const& handle, raft::device_span<size_t const> starting_vertex_label_offsets)
+{  // NOTE(review): int32_t{0} is presumably the first label value - confirm in misc_utils.cuh
+  return expand_sparse_offsets(starting_vertex_label_offsets, int32_t{0}, handle.get_stream());
+}
+
+template <typename label_t>
+rmm::device_uvector<int32_t> flatten_label_map(
+  raft::handle_t const& handle,
+  std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>
+    label_to_output_comm_rank)
+{
+  auto const [labels, ranks] = label_to_output_comm_rank;
+
+  // Size the map for labels in [0, max]; the 0 seed yields a one-entry map for empty input.
+  label_t const largest_label = thrust::reduce(handle.get_thrust_policy(),
+                                               labels.begin(),
+                                               labels.end(),
+                                               label_t{0},
+                                               thrust::maximum<label_t>());
+  rmm::device_uvector<int32_t> dense_map(largest_label + 1, handle.get_stream());
+
+  // Zero-fill so unlisted labels resolve to rank 0, then scatter each rank to its label's slot.
+  thrust::fill(handle.get_thrust_policy(), dense_map.begin(), dense_map.end(), int32_t{0});
+  thrust::scatter(
+    handle.get_thrust_policy(), ranks.begin(), ranks.end(), labels.begin(), dense_map.begin());
+
+  return dense_map;
+}
+
+template rmm::device_uvector<int32_t> flatten_label_map(
+  raft::handle_t const& handle,
+  std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>
+    label_to_output_comm_rank);
+
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/sampling/detail/sampling_utils.hpp b/cpp/src/sampling/detail/sampling_utils.hpp
index 102f9ec58f7..17eb8dd0873 100644
--- a/cpp/src/sampling/detail/sampling_utils.hpp
+++ b/cpp/src/sampling/detail/sampling_utils.hpp
@@ -293,7 +293,41 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<edge_type_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<label_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
+
+/**
+ * @brief   Convert the starting vertex offsets into starting vertex labels
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param starting_vertex_label_offsets Offsets array defining where each vertex label begins
+ *
+ * @returns device vector containing labels for each starting vertex
+ */
+rmm::device_uvector<int32_t> convert_starting_vertex_label_offsets_to_labels(
+  raft::handle_t const& handle, raft::device_span<size_t const> starting_vertex_label_offsets);
+
+/**
+ * @brief   Flatten the legacy label_to_output_comm_rank into the new structure
+ *
+ * The legacy structure supported arbitrary labels; the new structure is a dense mapping of labels
+ * in [0,n), where n is the largest label plus one.  Labels absent from the input map to rank 0.
+ *
+ * @tparam label_t typename for the label
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param label_to_output_comm_rank  A tuple containing label ids (used directly as indices into
+ * the result) and the comm rank each label should be assigned to
+ *
+ * @returns device vector containing the mapping to comm_rank.  Entry `i` will be the comm rank
+ * destination for label `i`.
+ */
+template <typename label_t>
+rmm::device_uvector<int32_t> flatten_label_map(
+  raft::handle_t const& handle,
+  std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>
     label_to_output_comm_rank);
+
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh b/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh
index ec14e99baec..391dd99b1df 100644
--- a/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh
+++ b/cpp/src/sampling/detail/shuffle_and_organize_output_impl.cuh
@@ -41,14 +41,12 @@ namespace detail {
 
 template <typename label_t>
 struct shuffle_to_output_comm_rank_t {
-  raft::device_span<label_t const> output_label_;
   raft::device_span<int32_t const> output_rank_;
 
   template <typename key_t>
   __device__ int32_t operator()(key_t key) const
   {
-    auto pos = thrust::lower_bound(thrust::seq, output_label_.begin(), output_label_.end(), key);
-    return output_rank_[thrust::distance(output_label_.begin(), pos)];
+    return output_rank_[key];  // key indexes the rank table directly: no search, no bounds check
   }
 };
 
@@ -206,8 +204,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<edge_type_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<label_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank)
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank)
 {
   std::optional<rmm::device_uvector<size_t>> offsets{std::nullopt};
 
@@ -215,8 +212,6 @@ shuffle_and_organize_output(
     sort_sampled_tuples(handle, majors, minors, weights, edge_ids, edge_types, hops, *labels);
 
     if (label_to_output_comm_rank) {
-      CUGRAPH_EXPECTS(labels, "labels must be specified in order to shuffle sampling results");
-
       auto& comm           = handle.get_comms();
       auto const comm_size = comm.get_size();
 
@@ -247,8 +242,7 @@ shuffle_and_organize_output(
                                           edge_ids->begin(),
                                           edge_types->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -282,8 +276,7 @@ shuffle_and_organize_output(
                                           weights->begin(),
                                           edge_ids->begin(),
                                           edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -317,8 +310,7 @@ shuffle_and_organize_output(
                                           weights->begin(),
                                           edge_ids->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -347,8 +339,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -383,8 +374,7 @@ shuffle_and_organize_output(
                                           weights->begin(),
                                           edge_types->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -413,8 +403,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), weights->begin(), edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -444,8 +433,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), weights->begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -471,8 +459,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -505,8 +492,7 @@ shuffle_and_organize_output(
                                           edge_ids->begin(),
                                           edge_types->begin(),
                                           hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -535,8 +521,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), edge_ids->begin(), edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -566,8 +551,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), edge_ids->begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -593,8 +577,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -623,8 +606,7 @@ shuffle_and_organize_output(
                 labels->end(),
                 thrust::make_zip_iterator(
                   majors.begin(), minors.begin(), edge_types->begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -651,8 +633,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -678,8 +659,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), hops->begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
@@ -702,8 +682,7 @@ shuffle_and_organize_output(
                 labels->begin(),
                 labels->end(),
                 thrust::make_zip_iterator(majors.begin(), minors.begin()),
-                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
-                                                       std::get<1>(*label_to_output_comm_rank)},
+                shuffle_to_output_comm_rank_t<label_t>{*label_to_output_comm_rank},
                 comm_size,
                 mem_frugal_threshold,
                 handle.get_stream());
diff --git a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
index 73a152487ca..4a264469c97 100644
--- a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
+++ b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v32_e32.cu
@@ -36,8 +36,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
@@ -56,8 +55,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
index ff7a716e609..f66ce3e2d63 100644
--- a/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
+++ b/cpp/src/sampling/detail/shuffle_and_organize_output_mg_v64_e64.cu
@@ -36,8 +36,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
@@ -56,8 +55,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<int32_t>>&& edge_types,
   std::optional<rmm::device_uvector<int32_t>>&& hops,
   std::optional<rmm::device_uvector<int32_t>>&& labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank);
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_impl.hpp b/cpp/src/sampling/neighbor_sampling_impl.hpp
index d8e8cc2b756..ccca71cdf20 100644
--- a/cpp/src/sampling/neighbor_sampling_impl.hpp
+++ b/cpp/src/sampling/neighbor_sampling_impl.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "prims/fill_edge_property.cuh"
+#include "prims/transform_e.cuh"
 #include "sampling/detail/sampling_utils.hpp"
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
@@ -48,41 +50,34 @@ std::tuple<rmm::device_uvector<vertex_t>,
            std::optional<rmm::device_uvector<int32_t>>,
            std::optional<rmm::device_uvector<label_t>>,
            std::optional<rmm::device_uvector<size_t>>>
-neighbor_sample_impl(
-  raft::handle_t const& handle,
-  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
-  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
-  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
-  std::optional<edge_property_view_t<edge_t, bias_t const*>> edge_bias_view,
-  raft::device_span<vertex_t const> this_frontier_vertices,
-  std::optional<raft::device_span<label_t const>> this_frontier_vertex_labels,
-  std::optional<std::tuple<raft::device_span<label_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  raft::random::RngState& rng_state,
-  bool do_expensive_check)
+neighbor_sample_impl(raft::handle_t const& handle,
+                     raft::random::RngState& rng_state,
+                     graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+                     std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+                     std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+                     std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+                     std::optional<edge_property_view_t<edge_t, bias_t const*>> edge_bias_view,
+                     raft::device_span<vertex_t const> starting_vertices,
+                     std::optional<raft::device_span<label_t const>> starting_vertex_labels,
+                     std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+                     raft::host_span<int32_t const> fan_out,
+                     edge_type_t num_edge_types,
+                     bool return_hops,
+                     bool with_replacement,
+                     prior_sources_behavior_t prior_sources_behavior,
+                     bool dedupe_sources,
+                     bool do_expensive_check)
 {
   static_assert(std::is_floating_point_v<bias_t>);
 
-  CUGRAPH_EXPECTS(fan_out.size() > 0, "Invalid input argument: number of levels must be non-zero.");
-  CUGRAPH_EXPECTS(
-    fan_out.size() <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-    "Invalid input argument: number of levels should not overflow int32_t");  // as we use int32_t
-                                                                              // to store hops
-
   if constexpr (!multi_gpu) {
     CUGRAPH_EXPECTS(!label_to_output_comm_rank,
                     "cannot specify output GPU mapping in SG implementation");
   }
 
   CUGRAPH_EXPECTS(
-    !label_to_output_comm_rank || this_frontier_vertex_labels,
-    "cannot specify output GPU mapping without also specifying this_frontier_vertex_labels");
+    !label_to_output_comm_rank || starting_vertex_labels,
+    "cannot specify output GPU mapping without also specifying starting_vertex_labels");
 
   if (do_expensive_check) {
     if (edge_bias_view) {
@@ -96,10 +91,45 @@ neighbor_sample_impl(
                       "Invalid input argument: sum of neighboring edge bias values should not "
                       "exceed std::numeric_limits<bias_t>::max() for any vertex.");
     }
+  }
+
+  CUGRAPH_EXPECTS(fan_out.size() > 0, "Invalid input argument: number of levels must be non-zero.");
+  CUGRAPH_EXPECTS(
+    fan_out.size() <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
+    "Invalid input argument: number of levels should not overflow int32_t");  // as we use int32_t
+                                                                              // to store hops
 
-    if (label_to_output_comm_rank) {
-      CUGRAPH_EXPECTS(cugraph::detail::is_sorted(handle, std::get<0>(*label_to_output_comm_rank)),
-                      "Labels in label_to_output_comm_rank must be sorted");
+  std::vector<
+    cugraph::edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>, bool>>
+    edge_masks_vector{};
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> modified_graph_view = graph_view;
+  edge_masks_vector.reserve(num_edge_types);
+
+  if (num_edge_types > 1) {
+    for (int i = 0; i < num_edge_types; i++) {
+      cugraph::edge_property_t<graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>, bool>
+        edge_mask(handle, graph_view);
+
+      cugraph::fill_edge_property(
+        handle, modified_graph_view, edge_mask.mutable_view(), bool{true});
+
+      cugraph::transform_e(
+        handle,
+        modified_graph_view,
+        cugraph::edge_src_dummy_property_t{}.view(),
+        cugraph::edge_dst_dummy_property_t{}.view(),
+        *edge_type_view,
+        [valid_edge_type = i] __device__(auto src,
+                                         auto dst,
+                                         thrust::nullopt_t,
+                                         thrust::nullopt_t,
+                                         /*thrust::nullopt_t*/ auto edge_type) {
+          return edge_type == valid_edge_type;
+        },
+        edge_mask.mutable_view(),
+        false);
+
+      edge_masks_vector.push_back(std::move(edge_mask));
     }
   }
 
@@ -114,8 +144,8 @@ neighbor_sample_impl(
     edge_type_view ? std::make_optional(std::vector<rmm::device_uvector<edge_type_t>>{})
                    : std::nullopt;
   auto level_result_label_vectors =
-    this_frontier_vertex_labels ? std::make_optional(std::vector<rmm::device_uvector<label_t>>{})
-                                : std::nullopt;
+    starting_vertex_labels ? std::make_optional(std::vector<rmm::device_uvector<label_t>>{})
+                           : std::nullopt;
 
   level_result_src_vectors.reserve(fan_out.size());
   level_result_dst_vectors.reserve(fan_out.size());
@@ -126,7 +156,7 @@ neighbor_sample_impl(
 
   rmm::device_uvector<vertex_t> frontier_vertices(0, handle.get_stream());
   auto frontier_vertex_labels =
-    this_frontier_vertex_labels
+    starting_vertex_labels
       ? std::make_optional(rmm::device_uvector<label_t>{0, handle.get_stream()})
       : std::nullopt;
 
@@ -137,84 +167,95 @@ neighbor_sample_impl(
   if (prior_sources_behavior == prior_sources_behavior_t::EXCLUDE) {
     vertex_used_as_source = std::make_optional(
       std::make_tuple(rmm::device_uvector<vertex_t>{0, handle.get_stream()},
-                      this_frontier_vertex_labels
+                      starting_vertex_labels
                         ? std::make_optional(rmm::device_uvector<label_t>{0, handle.get_stream()})
                         : std::nullopt));
   }
 
   std::vector<size_t> level_sizes{};
-  int32_t hop{0};
-  for (auto&& k_level : fan_out) {
-    rmm::device_uvector<vertex_t> srcs(0, handle.get_stream());
-    rmm::device_uvector<vertex_t> dsts(0, handle.get_stream());
-    std::optional<rmm::device_uvector<weight_t>> weights{std::nullopt};
-    std::optional<rmm::device_uvector<edge_t>> edge_ids{std::nullopt};
-    std::optional<rmm::device_uvector<edge_type_t>> edge_types{std::nullopt};
-    std::optional<rmm::device_uvector<int32_t>> labels{std::nullopt};
-
-    if (k_level > 0) {
-      std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
-        sample_edges(handle,
-                     graph_view,
-                     edge_weight_view,
-                     edge_id_view,
-                     edge_type_view,
-                     edge_bias_view,
-                     rng_state,
-                     this_frontier_vertices,
-                     this_frontier_vertex_labels,
-                     static_cast<size_t>(k_level),
-                     with_replacement);
-    } else {
-      std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
-        gather_one_hop_edgelist(handle,
-                                graph_view,
-                                edge_weight_view,
-                                edge_id_view,
-                                edge_type_view,
-                                this_frontier_vertices,
-                                this_frontier_vertex_labels);
-    }
 
-    level_sizes.push_back(srcs.size());
-
-    level_result_src_vectors.push_back(std::move(srcs));
-    level_result_dst_vectors.push_back(std::move(dsts));
-    if (weights) { (*level_result_weight_vectors).push_back(std::move(*weights)); }
-    if (edge_ids) { (*level_result_edge_id_vectors).push_back(std::move(*edge_ids)); }
-    if (edge_types) { (*level_result_edge_type_vectors).push_back(std::move(*edge_types)); }
-    if (labels) { (*level_result_label_vectors).push_back(std::move(*labels)); }
-
-    ++hop;
-    if (hop < fan_out.size()) {
-      // FIXME:  We should modify vertex_partition_range_lasts to return a raft::host_span
-      //  rather than making a copy.
-      auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts();
-      std::tie(frontier_vertices, frontier_vertex_labels, vertex_used_as_source) =
-        prepare_next_frontier(
-          handle,
-          this_frontier_vertices,
-          this_frontier_vertex_labels,
-          raft::device_span<vertex_t const>{level_result_dst_vectors.back().data(),
-                                            level_result_dst_vectors.back().size()},
-          frontier_vertex_labels ? std::make_optional(raft::device_span<label_t const>(
-                                     level_result_label_vectors->back().data(),
-                                     level_result_label_vectors->back().size()))
-                                 : std::nullopt,
-          std::move(vertex_used_as_source),
-          graph_view.local_vertex_partition_view(),
-          vertex_partition_range_lasts,
-          prior_sources_behavior,
-          dedupe_sources,
-          do_expensive_check);
-
-      this_frontier_vertices =
-        raft::device_span<vertex_t const>(frontier_vertices.data(), frontier_vertices.size());
-
-      if (frontier_vertex_labels) {
-        this_frontier_vertex_labels = raft::device_span<label_t const>(
-          frontier_vertex_labels->data(), frontier_vertex_labels->size());
+  // Number of hops = ceil(fan_out.size() / num_edge_types); num_edge_types is 1 when homogeneous
+  auto num_hops = ((fan_out.size() % num_edge_types) == 0)
+                    ? (fan_out.size() / num_edge_types)
+                    : ((fan_out.size() / num_edge_types) + 1);
+
+  for (auto hop = 0; hop < num_hops; hop++) {
+    for (auto edge_type_id = 0; edge_type_id < num_edge_types; edge_type_id++) {
+      auto k_level = fan_out[(hop * num_edge_types) + edge_type_id];
+      rmm::device_uvector<vertex_t> srcs(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dsts(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> weights{std::nullopt};
+      std::optional<rmm::device_uvector<edge_t>> edge_ids{std::nullopt};
+      std::optional<rmm::device_uvector<edge_type_t>> edge_types{std::nullopt};
+      std::optional<rmm::device_uvector<int32_t>> labels{std::nullopt};
+
+      if (num_edge_types > 1) {
+        modified_graph_view.attach_edge_mask(edge_masks_vector[edge_type_id].view());
+      }
+
+      if (k_level > 0) {
+        std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
+          sample_edges(handle,
+                       modified_graph_view,
+                       edge_weight_view,
+                       edge_id_view,
+                       edge_type_view,
+                       edge_bias_view,
+                       rng_state,
+                       starting_vertices,
+                       starting_vertex_labels,
+                       static_cast<size_t>(k_level),
+                       with_replacement);
+      } else {
+        std::tie(srcs, dsts, weights, edge_ids, edge_types, labels) =
+          gather_one_hop_edgelist(handle,
+                                  modified_graph_view,
+                                  edge_weight_view,
+                                  edge_id_view,
+                                  edge_type_view,
+                                  starting_vertices,
+                                  starting_vertex_labels);
       }
+
+      level_sizes.push_back(srcs.size());
+      level_result_src_vectors.push_back(std::move(srcs));
+      level_result_dst_vectors.push_back(std::move(dsts));
+
+      if (weights) { (*level_result_weight_vectors).push_back(std::move(*weights)); }
+      if (edge_ids) { (*level_result_edge_id_vectors).push_back(std::move(*edge_ids)); }
+      if (edge_types) { (*level_result_edge_type_vectors).push_back(std::move(*edge_types)); }
+      if (labels) { (*level_result_label_vectors).push_back(std::move(*labels)); }
+
+      if (num_edge_types > 1) { modified_graph_view.clear_edge_mask(); }
+    }
+
+    // FIXME:  We should modify vertex_partition_range_lasts to return a raft::host_span
+    //  rather than making a copy.
+    auto vertex_partition_range_lasts = modified_graph_view.vertex_partition_range_lasts();
+    std::tie(frontier_vertices, frontier_vertex_labels, vertex_used_as_source) =
+      prepare_next_frontier(
+        handle,
+        starting_vertices,
+        starting_vertex_labels,
+        raft::device_span<vertex_t const>{level_result_dst_vectors.back().data(),
+                                          level_result_dst_vectors.back().size()},
+        frontier_vertex_labels
+          ? std::make_optional(raft::device_span<label_t const>(
+              level_result_label_vectors->back().data(), level_result_label_vectors->back().size()))
+          : std::nullopt,
+        std::move(vertex_used_as_source),
+        modified_graph_view.local_vertex_partition_view(),
+        vertex_partition_range_lasts,
+        prior_sources_behavior,
+        dedupe_sources,
+        do_expensive_check);
+
+    starting_vertices =
+      raft::device_span<vertex_t const>(frontier_vertices.data(), frontier_vertices.size());
+
+    if (frontier_vertex_labels) {
+      starting_vertex_labels = raft::device_span<label_t const>(frontier_vertex_labels->data(),
+                                                                frontier_vertex_labels->size());
     }
   }
 
@@ -368,8 +409,16 @@ uniform_neighbor_sample(
   bool do_expensive_check)
 {
   using bias_t = weight_t;  // dummy
+
+  rmm::device_uvector<int32_t> label_map(0, handle.get_stream());
+
+  if (label_to_output_comm_rank) {
+    label_map = detail::flatten_label_map(handle, *label_to_output_comm_rank);
+  }
+
   return detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
     handle,
+    rng_state,
     graph_view,
     edge_weight_view,
     edge_id_view,
@@ -377,13 +426,15 @@ uniform_neighbor_sample(
     std::nullopt,
     starting_vertices,
     starting_vertex_labels,
-    label_to_output_comm_rank,
+    label_to_output_comm_rank
+      ? std::make_optional(raft::device_span<int32_t const>{label_map.data(), label_map.size()})
+      : std::nullopt,
     fan_out,
+    edge_type_t{1},
     return_hops,
     with_replacement,
     prior_sources_behavior,
     dedupe_sources,
-    rng_state,
     do_expensive_check);
 }
 
@@ -422,8 +473,15 @@ biased_neighbor_sample(
   bool dedupe_sources,
   bool do_expensive_check)
 {
+  rmm::device_uvector<int32_t> label_map(0, handle.get_stream());
+
+  if (label_to_output_comm_rank) {
+    label_map = detail::flatten_label_map(handle, *label_to_output_comm_rank);
+  }
+
   return detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
     handle,
+    rng_state,
     graph_view,
     edge_weight_view,
     edge_id_view,
@@ -431,14 +489,252 @@ biased_neighbor_sample(
     edge_bias_view,
     starting_vertices,
     starting_vertex_labels,
-    label_to_output_comm_rank,
+    label_to_output_comm_rank
+      ? std::make_optional(raft::device_span<int32_t const>{label_map.data(), label_map.size()})
+      : std::nullopt,
     fan_out,
+    edge_type_t{1},
     return_hops,
     with_replacement,
     prior_sources_behavior,
     dedupe_sources,
-    rng_state,
     do_expensive_check);
 }
 
+// Heterogeneous uniform neighbor sampling: forwards to detail::neighbor_sample_impl with an empty
+// edge-bias view and returns every output of that call except the labels.
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  using bias_t = weight_t;  // placeholder type: no bias property is supplied on this path
+
+  auto sampled = detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+    handle,
+    rng_state,
+    graph_view,
+    edge_weight_view,
+    edge_id_view,
+    edge_type_view,
+    std::optional<edge_property_view_t<edge_t, bias_t const*>>{std::nullopt},  // no edge bias
+    starting_vertices,
+    starting_vertex_labels,
+    label_to_output_comm_rank,
+    fan_out,
+    num_edge_types,
+    sampling_flags.return_hops,
+    sampling_flags.with_replacement,
+    sampling_flags.prior_sources_behavior,
+    sampling_flags.dedupe_sources,
+    do_expensive_check);
+
+  return std::make_tuple(std::move(std::get<0>(sampled)),   // majors
+                         std::move(std::get<1>(sampled)),   // minors
+                         std::move(std::get<2>(sampled)),   // weights
+                         std::move(std::get<3>(sampled)),   // edge ids
+                         std::move(std::get<4>(sampled)),   // edge types
+                         std::move(std::get<5>(sampled)),   // hops
+                         std::move(std::get<7>(sampled)));  // offsets (6 = labels, unused)
+}
+
+// Heterogeneous biased neighbor sampling: forwards to detail::neighbor_sample_impl, drops labels.
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  edge_type_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  auto sampled = detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+    handle,
+    rng_state,
+    graph_view,
+    edge_weight_view,
+    edge_id_view,
+    edge_type_view,
+    std::make_optional(edge_bias_view),  // required bias property, wrapped as an optional
+    starting_vertices,
+    starting_vertex_labels,
+    label_to_output_comm_rank,
+    fan_out,
+    num_edge_types,
+    sampling_flags.return_hops,
+    sampling_flags.with_replacement,
+    sampling_flags.prior_sources_behavior,
+    sampling_flags.dedupe_sources,
+    do_expensive_check);
+
+  return std::make_tuple(std::move(std::get<0>(sampled)),   // majors
+                         std::move(std::get<1>(sampled)),   // minors
+                         std::move(std::get<2>(sampled)),   // weights
+                         std::move(std::get<3>(sampled)),   // edge ids
+                         std::move(std::get<4>(sampled)),   // edge types
+                         std::move(std::get<5>(sampled)),   // hops
+                         std::move(std::get<7>(sampled)));  // offsets (6 = labels, unused)
+}
+
+// Homogeneous uniform neighbor sampling: forwards to detail::neighbor_sample_impl with an empty
+// edge-bias view and a single edge type, then returns every output of that call except labels.
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  using bias_t = weight_t;  // placeholder type: no bias property is supplied on this path
+
+  auto sampled = detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+    handle,
+    rng_state,
+    graph_view,
+    edge_weight_view,
+    edge_id_view,
+    edge_type_view,
+    std::optional<edge_property_view_t<edge_t, bias_t const*>>{std::nullopt},  // no edge bias
+    starting_vertices,
+    starting_vertex_labels,
+    label_to_output_comm_rank,
+    fan_out,
+    edge_type_t{1},  // homogeneous: a single edge type
+    sampling_flags.return_hops,
+    sampling_flags.with_replacement,
+    sampling_flags.prior_sources_behavior,
+    sampling_flags.dedupe_sources,
+    do_expensive_check);
+
+  return std::make_tuple(std::move(std::get<0>(sampled)),   // majors
+                         std::move(std::get<1>(sampled)),   // minors
+                         std::move(std::get<2>(sampled)),   // weights
+                         std::move(std::get<3>(sampled)),   // edge ids
+                         std::move(std::get<4>(sampled)),   // edge types
+                         std::move(std::get<5>(sampled)),   // hops
+                         std::move(std::get<7>(sampled)));  // offsets (6 = labels, unused)
+}
+
+// Homogeneous biased neighbor sampling: one edge type, forwards to the impl, drops labels.
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          typename edge_type_t,
+          typename bias_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>,
+           std::optional<rmm::device_uvector<int32_t>>,
+           std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+  std::optional<edge_property_view_t<edge_t, edge_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<edge_t, edge_type_t const*>> edge_type_view,
+  edge_property_view_t<edge_t, bias_t const*> edge_bias_view,
+  raft::device_span<vertex_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check)
+{
+  auto sampled = detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
+    handle,
+    rng_state,
+    graph_view,
+    edge_weight_view,
+    edge_id_view,
+    edge_type_view,
+    std::make_optional(edge_bias_view),  // required bias property, wrapped as an optional
+    starting_vertices,
+    starting_vertex_labels,
+    label_to_output_comm_rank,
+    fan_out,
+    edge_type_t{1},  // homogeneous: a single edge type
+    sampling_flags.return_hops,
+    sampling_flags.with_replacement,
+    sampling_flags.prior_sources_behavior,
+    sampling_flags.dedupe_sources,
+    do_expensive_check);
+
+  return std::make_tuple(std::move(std::get<0>(sampled)),   // majors
+                         std::move(std::get<1>(sampled)),   // minors
+                         std::move(std::get<2>(sampled)),   // weights
+                         std::move(std::get<3>(sampled)),   // edge ids
+                         std::move(std::get<4>(sampled)),   // edge types
+                         std::move(std::get<5>(sampled)),   // hops
+                         std::move(std::get<7>(sampled)));  // offsets (6 = labels, unused)
+}
+
 }  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cpp b/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cpp
deleted file mode 100644
index f61c1c10c53..00000000000
--- a/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, float const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, double const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cu b/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cu
new file mode 100644
index 00000000000..d848935cc7e
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_mg_v32_e32.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(  // explicit instantiation (v32/e32): float edge weights
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(  // explicit instantiation (v32/e32): double edge weights
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(  // explicit instantiation (v32/e32): float edge weights and biases
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(  // explicit instantiation (v32/e32): double edge weights and biases
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(  // explicit instantiation (v32/e32): float edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(  // explicit instantiation (v32/e32): double edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(  // explicit instantiation (v32/e32): double weights/biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(  // explicit instantiation (v32/e32): float weights/biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(  // explicit instantiation (v32/e32): float edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(  // explicit instantiation (v32/e32): double edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(  // explicit instantiation (v32/e32): double weights/biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(  // explicit instantiation (v32/e32): float weights/biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cpp b/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cpp
deleted file mode 100644
index ea3f6b466da..00000000000
--- a/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, float const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, double const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cu b/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cu
new file mode 100644
index 00000000000..505deec51f5
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_mg_v64_e64.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(  // explicit instantiation (v64/e64): float edge weights
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(  // explicit instantiation (v64/e64): double edge weights
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(  // explicit instantiation (v64/e64): float edge weights and biases
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(  // explicit instantiation (v64/e64): double edge weights and biases
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(  // explicit instantiation (v64/e64): float edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(  // explicit instantiation (v64/e64): double edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(  // explicit instantiation (v64/e64): double weights/biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(  // int64_t vertex and edge ids, float edge weights and biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,  // NOTE(review): `false, true` presumably = not transposed, multi-GPU; confirm
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(  // int64_t vertex and edge ids, float edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,  // NOTE(review): `false, true` presumably = not transposed, multi-GPU; confirm
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(  // int64_t vertex and edge ids, double edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,  // NOTE(review): `false, true` presumably = not transposed, multi-GPU; confirm
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(  // int64_t vertex and edge ids, double edge weights and biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,  // NOTE(review): `false, true` presumably = not transposed, multi-GPU; confirm
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(  // int64_t vertex and edge ids, float edge weights and biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,  // NOTE(review): `false, true` presumably = not transposed, multi-GPU; confirm
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cpp b/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cpp
deleted file mode 100644
index 0f0affbb323..00000000000
--- a/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, float const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int32_t>,
-                    rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int32_t, double const*> edge_bias_view,
-  raft::device_span<int32_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cu b/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cu
new file mode 100644
index 00000000000..72bbb4e27a8
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_sg_v32_e32.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(  // int32_t vertex and edge ids, float edge weights
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,  // optional pair of int32_t device spans
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(  // int32_t vertex and edge ids, double edge weights
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,  // optional pair of int32_t device spans
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(  // int32_t vertex and edge ids, float edge weights and biases
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,  // optional pair of int32_t device spans
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(  // int32_t vertex and edge ids, double edge weights and biases
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,  // optional pair of int32_t device spans
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(  // int32_t vertex and edge ids, float edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(  // int32_t vertex and edge ids, double edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(  // int32_t vertex and edge ids, double edge weights and biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(  // int32_t vertex and edge ids, float edge weights and biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(  // int32_t vertex and edge ids, float edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(  // int32_t vertex and edge ids, double edge weights
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(  // int32_t vertex and edge ids, double edge weights and biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, double const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,  // explicit instantiation definition; return tuple type starts here
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(  // int32_t vertex and edge ids, float edge weights and biases
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,  // NOTE(review): `false, false` presumably = not transposed, single-GPU; confirm
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int32_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int32_t, float const*> edge_bias_view,  // mandatory, unlike the optional weight/id/type views
+  raft::device_span<int32_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cpp b/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cpp
deleted file mode 100644
index 70dd9a59842..00000000000
--- a/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "neighbor_sampling_impl.hpp"
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/sampling_functions.hpp>
-
-namespace cugraph {
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-uniform_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, float const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-template std::tuple<rmm::device_uvector<int64_t>,
-                    rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>,
-                    std::optional<rmm::device_uvector<int64_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<int32_t>>,
-                    std::optional<rmm::device_uvector<size_t>>>
-biased_neighbor_sample(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
-  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
-  edge_property_view_t<int64_t, double const*> edge_bias_view,
-  raft::device_span<int64_t const> starting_vertices,
-  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
-  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
-    label_to_output_comm_rank,
-  raft::host_span<int32_t const> fan_out,
-  raft::random::RngState& rng_state,
-  bool return_hops,
-  bool with_replacement,
-  prior_sources_behavior_t prior_sources_behavior,
-  bool dedupe_sources,
-  bool do_expensive_check);
-
-}  // namespace cugraph
diff --git a/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cu b/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cu
new file mode 100644
index 00000000000..6aa8c71429a
--- /dev/null
+++ b/cpp/src/sampling/neighbor_sampling_sg_v64_e64.cu
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "neighbor_sampling_impl.hpp"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/sampling_functions.hpp>
+
+namespace cugraph {
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+biased_neighbor_sample(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<std::tuple<raft::device_span<int32_t const>, raft::device_span<int32_t const>>>
+    label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  raft::random::RngState& rng_state,
+  bool return_hops,
+  bool with_replacement,
+  prior_sources_behavior_t prior_sources_behavior,
+  bool dedupe_sources,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+heterogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  int32_t num_edge_types,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_uniform_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, double const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<size_t>>>
+homogeneous_biased_neighbor_sample(
+  raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  std::optional<edge_property_view_t<int64_t, int64_t const*>> edge_id_view,
+  std::optional<edge_property_view_t<int64_t, int32_t const*>> edge_type_view,
+  edge_property_view_t<int64_t, float const*> edge_bias_view,
+  raft::device_span<int64_t const> starting_vertices,
+  std::optional<raft::device_span<int32_t const>> starting_vertex_labels,
+  std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank,
+  raft::host_span<int32_t const> fan_out,
+  sampling_flags_t sampling_flags,
+  bool do_expensive_check);
+
+}  // namespace cugraph
diff --git a/cpp/src/utilities/shuffle_vertices.cuh b/cpp/src/utilities/shuffle_vertices.cuh
index adce03f7c29..5ed6513816f 100644
--- a/cpp/src/utilities/shuffle_vertices.cuh
+++ b/cpp/src/utilities/shuffle_vertices.cuh
@@ -44,22 +44,43 @@ rmm::device_uvector<vertex_t> shuffle_vertices_by_gpu_id_impl(
   return d_rx_vertices;
 }
 
-template <typename vertex_t, typename value_t, typename func_t>
-std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
-shuffle_vertices_and_values_by_gpu_id_impl(raft::handle_t const& handle,
-                                           rmm::device_uvector<vertex_t>&& d_vertices,
-                                           rmm::device_uvector<value_t>&& d_values,
-                                           func_t func)
+template <typename vertex_t, typename value0_t, typename value1_t, typename func_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<value0_t>,
+           std::optional<rmm::device_uvector<value1_t>>>
+shuffle_vertices_and_values_by_gpu_id_impl(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& d_vertices,
+  rmm::device_uvector<value0_t>&& d_values_0,
+  std::optional<rmm::device_uvector<value1_t>>&& d_values_1,
+  func_t func)
 {
-  std::tie(d_vertices, d_values, std::ignore) = cugraph::groupby_gpu_id_and_shuffle_kv_pairs(
-    handle.get_comms(),
-    d_vertices.begin(),
-    d_vertices.end(),
-    d_values.begin(),
-    [key_func = func] __device__(auto val) { return key_func(val); },
-    handle.get_stream());
-
-  return std::make_tuple(std::move(d_vertices), std::move(d_values));
+  if (d_values_1) {
+    auto [d_shuffled_vertices, d_values, counts] = cugraph::groupby_gpu_id_and_shuffle_kv_pairs(
+      handle.get_comms(),
+      d_vertices.begin(),
+      d_vertices.end(),
+      thrust::make_zip_iterator(d_values_0.begin(), (*d_values_1).begin()),
+      [key_func = func] __device__(auto val) { return key_func(val); },
+      handle.get_stream());
+    // The zipped values come back as a tuple of buffers; hand each one back separately.
+    return std::make_tuple(std::move(d_shuffled_vertices),
+                           std::move(std::get<0>(d_values)),
+                           std::make_optional(std::move(std::get<1>(d_values))));
+  } else {
+    auto [d_shuffled_vertices, d_values, counts] = cugraph::groupby_gpu_id_and_shuffle_kv_pairs(
+      handle.get_comms(),
+      d_vertices.begin(),
+      d_vertices.end(),
+      d_values_0.begin(),
+      [key_func = func] __device__(auto val) { return key_func(val); },
+      handle.get_stream());
+    // No second value array: return an empty optional built from value1_t (not a hard-coded
+    // int32_t) so the tuple converts to the declared return type for every instantiation.
+    return std::make_tuple(std::move(d_shuffled_vertices),
+                           std::move(d_values),
+                           std::optional<rmm::device_uvector<value1_t>>{std::nullopt});
+  }
 }
 
 }  // namespace
@@ -96,12 +117,18 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
   auto const minor_comm_size = minor_comm.get_size();
 
-  return shuffle_vertices_and_values_by_gpu_id_impl(
+  rmm::device_uvector<vertex_t> d_vertices(0, handle.get_stream());
+  rmm::device_uvector<value_t> d_values(0, handle.get_stream());
+
+  std::tie(d_vertices, d_values, std::ignore) = shuffle_vertices_and_values_by_gpu_id_impl(
     handle,
     std::move(vertices),
     std::move(values),
+    std::optional<rmm::device_uvector<int32_t>>{std::nullopt},
     cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size});
+
+  return std::make_tuple(std::move(d_vertices), std::move(d_values));
 }
 
 template <typename vertex_t>
@@ -154,17 +181,21 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
   auto const minor_comm_size = minor_comm.get_size();
 
-  auto return_value = shuffle_vertices_and_values_by_gpu_id_impl(
+  rmm::device_uvector<vertex_t> d_vertices(0, handle.get_stream());
+  rmm::device_uvector<value_t> d_values(0, handle.get_stream());
+
+  std::tie(d_vertices, d_values, std::ignore) = shuffle_vertices_and_values_by_gpu_id_impl(
     handle,
     std::move(vertices),
     std::move(values),
+    std::optional<rmm::device_uvector<int32_t>>{std::nullopt},
     cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
       raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(),
                                         d_vertex_partition_range_lasts.size()),
       major_comm_size,
       minor_comm_size});
 
-  return return_value;
+  return std::make_tuple(std::move(d_vertices), std::move(d_values));
 }
 
 }  // namespace detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3752e823659..a2eeafea8cf 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -483,10 +483,29 @@ ConfigureTest(RANDOM_WALKS_TEST sampling/sg_random_walks_test.cpp)
 # - UNIFORM NBR SAMPLING tests --------------------------------------------------------------------
 ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/uniform_neighbor_sampling.cpp)
 
+# - HOMOGENEOUS UNIFORM NBR SAMPLING tests --------------------------------------------------------
+ConfigureTest(
+    HOMOGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/homogeneous_uniform_neighbor_sampling.cpp)
+
+# - HETEROGENEOUS UNIFORM NBR SAMPLING tests -----------------------------------------------------
+ConfigureTest(
+    HETEROGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/heterogeneous_uniform_neighbor_sampling.cpp)
+
 ###################################################################################################
 # - BIASED NBR SAMPLING tests ---------------------------------------------------------------------
 ConfigureTest(BIASED_NEIGHBOR_SAMPLING_TEST sampling/biased_neighbor_sampling.cpp)
 
+###################################################################################################
+# - HOMOGENEOUS BIASED NBR SAMPLING tests ---------------------------------------------------------
+ConfigureTest(
+    HOMOGENEOUS_BIASED_NEIGHBOR_SAMPLING_TEST sampling/homogeneous_biased_neighbor_sampling.cpp)
+
+###################################################################################################
+# - HETEROGENEOUS BIASED NBR SAMPLING tests -------------------------------------------------------
+ConfigureTest(
+    HETEROGENEOUS_BIASED_NEIGHBOR_SAMPLING_TEST sampling/heterogeneous_biased_neighbor_sampling.cpp
+        GPUS 1 PERCENT 75)
+
 ###################################################################################################
 # - SAMPLING_POST_PROCESSING tests ----------------------------------------------------------------
 ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cpp)
@@ -751,6 +770,26 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG UNIFORM NBR SAMPLING tests -------------------------------------------------------------
     ConfigureTestMG(MG_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_uniform_neighbor_sampling.cpp)
 
+    ###############################################################################################
+    # - MG HOMOGENEOUS UNIFORM NBR SAMPLING tests -------------------------------------------------
+    ConfigureTestMG(
+        MG_HOMOGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_homogeneous_uniform_neighbor_sampling.cpp)
+
+    ###############################################################################################
+    # - MG HETEROGENEOUS UNIFORM NBR SAMPLING tests -------------------------------------------------
+    ConfigureTestMG(
+        MG_HETEROGENEOUS_UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp)
+
+    ###############################################################################################
+    # - MG HOMOGENEOUS BIASED NBR SAMPLING tests --------------------------------------------------
+    ConfigureTestMG(
+        MG_HOMOGENEOUS_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_homogeneous_biased_neighbor_sampling.cpp)
+
+    ###############################################################################################
+    # - MG HETEROGENEOUS BIASED NBR SAMPLING tests --------------------------------------------------
+    ConfigureTestMG(
+        MG_HETEROGENEOUS_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_heterogeneous_biased_neighbor_sampling.cpp)
+
     ###############################################################################################
     # - MG BIASED NBR SAMPLING tests --------------------------------------------------------------
     ConfigureTestMG(MG_BIASED_NEIGHBOR_SAMPLING_TEST sampling/mg_biased_neighbor_sampling.cpp)
diff --git a/cpp/tests/sampling/heterogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/heterogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..6ea00cf5104
--- /dev/null
+++ b/cpp/tests/sampling/heterogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};  // fan-out values handed to the sampler as a host span
+  int32_t batch_size{10};             // passed to test::sequence() to build the batch numbers
+  int32_t num_edge_types{1};          // edge types are generated only when this is > 1
+  bool flag_replacement{true};        // forwarded as the last sampling_flags_t field
+  // FIXME: Edge masking is currently not tested because it will
+  // require attaching two masks (edge_type_t, bool_t) which
+  // is not currently supported. Once a primitive to support
+  // heterogeneous sampling is added, maintaining two masks
+  // won't be necessary
+  // bool edge_masking{false};
+  bool check_correctness{true};  // gates the post-sampling validation block
+};
+
+template <typename input_usecase_t>
+class Tests_Heterogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Heterogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    constexpr float select_probability{0.05};
+
+    // A fixed seed initializes the RngState that is shared by the source
+    // selection and the sampling call below.
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number =
+      cugraph::test::sequence(handle,
+                              random_sources.size(),
+                              heterogeneous_biased_neighbor_sampling_usecase.batch_size,
+                              int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), edge_type_t>> edge_types{
+      std::nullopt};
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(graph_view), edge_type_t>::edge_property(
+        handle, graph_view, heterogeneous_biased_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Biased neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_biased_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        *edge_weight_view,
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_biased_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  heterogeneous_biased_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for large number of vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Heterogeneous_Biased_Neighbor_Sampling_File =
+  Tests_Heterogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat =
+  Tests_Heterogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_Heterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Heterogeneous_Biased_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, false, false},
+      Heterogeneous_Biased_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+// #endif
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/heterogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/heterogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..3b57aed4768
--- /dev/null
+++ b/cpp/tests/sampling/heterogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Uniform_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};  // fan-out values handed to the sampler as a host span
+  int32_t batch_size{10};             // passed to test::sequence() to build the batch numbers
+  int32_t num_edge_types{1};          // edge types are generated only when this is > 1
+  bool flag_replacement{true};        // forwarded as the last sampling_flags_t field
+  // FIXME: Edge masking is currently not tested because it will
+  // require attaching two masks (edge_type_t, bool_t) which
+  // is not currently supported. Once a primitive to support
+  // heterogeneous sampling is added, maintaining two masks
+  // won't be necessary
+  // bool edge_masking{false};
+  bool check_correctness{true};  // gates the post-sampling validation block
+};
+
+template <typename input_usecase_t>
+class Tests_Heterogeneous_Uniform_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    constexpr float select_probability{0.05};
+
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number =
+      cugraph::test::sequence(handle,
+                              random_sources.size(),
+                              heterogeneous_uniform_neighbor_sampling_usecase.batch_size,
+                              int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), int32_t>> edge_types{std::nullopt};
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(graph_view), int32_t>::edge_property(
+        handle, graph_view, heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Uniform neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_uniform_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{
+          cugraph::prior_sources_behavior_t{0},
+          true,   // return_hops
+          false,  // dedupe_sources
+          heterogeneous_uniform_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for large number of vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Heterogeneous_Uniform_Neighbor_Sampling_File =
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat =
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_Heterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, false, false},
+      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{
+        {4, 10, 7, 8, 1, 9, 5, 12}, 1024, 4, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+// #endif
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/homogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/homogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..14cf54e7d1c
--- /dev/null
+++ b/cpp/tests/sampling/homogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};  // fan-out values handed to the sampler as a host span
+  int32_t batch_size{10};             // passed to test::sequence() to build the batch numbers
+  bool flag_replacement{true};        // forwarded as the last sampling_flags_t field
+
+  bool edge_masking{false};      // when true, a generated bool edge mask is attached to the graph
+  bool check_correctness{true};  // gates the post-sampling validation block
+};
+
+template <typename input_usecase_t>
+class Tests_Homogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Homogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    auto [homogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_biased_neighbor_sampling_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<decltype(graph_view), bool>::edge_property(handle, graph_view, 2);
+      graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    constexpr float select_probability{0.05};
+
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number = cugraph::test::sequence(handle,
+                                           random_sources.size(),
+                                           homogeneous_biased_neighbor_sampling_usecase.batch_size,
+                                           int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Biased neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_biased_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        *edge_weight_view,
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(homogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_biased_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for large number of vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          homogeneous_biased_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Homogeneous_Biased_Neighbor_Sampling_File =  // fixture fed by File_Usecase inputs
+  Tests_Homogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat =  // fixture fed by Rmat_Usecase inputs
+  Tests_Homogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  auto const test_param = override_File_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int32_t, int32_t, float>(test_param);  // template args mirror the test name
+}
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  auto const test_param = override_File_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int64_t, int64_t, float>(test_param);  // template args mirror the test name
+}
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  auto const test_param = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int32_t, int32_t, float>(test_param);  // template args mirror the test name
+}
+
+TEST_P(Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  auto const test_param = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int64_t, int64_t, float>(test_param);  // template args mirror the test name
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,  // four usecase variants (all combinations of the two bools) on karate.mtx
+  Tests_Homogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(  // bool fields: presumably {flag_replacement, edge_masking} - confirm
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,  // the same four usecase variants, each paired with three .mtx datasets
+  Tests_Homogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(  // bool fields: presumably {flag_replacement, edge_masking} - confirm
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,  // generated input; Rmat args presumably start with scale 10, edge factor 16
+  Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(  // bool fields: presumably {flag_replacement, edge_masking} - confirm
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* Note: when benchmarking, scale & edge factor can be overridden by
+                          command line arguments (use --gtest_filter to select only the
+                          rmat_benchmark_test with a specific vertex & edge type combination).
+                          Do not include more than one Rmat_Usecase that differs only in scale or
+                          edge factor, to avoid running the same benchmark more than once. */
+  Tests_Homogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(  // fifth field: presumably check_correctness (false here) - confirm
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, true, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()  // NOTE(review): presumably defines main(); macro defined elsewhere
diff --git a/cpp/tests/sampling/homogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/homogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..a257e424b3e
--- /dev/null
+++ b/cpp/tests/sampling/homogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/property_generator_utilities.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Uniform_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout = {-1};  // fanout values handed to the sampler as a host_span
+  int32_t batch_size          = 10;    // forwarded to cugraph::test::sequence() for batch labels
+  bool flag_replacement       = true;  // forwarded as the last sampling_flags_t field
+
+  bool edge_masking      = false;  // when set, a generated bool edge mask is attached
+  bool check_correctness = true;   // when set, the sampled edges are validated
+};
+
+template <typename input_usecase_t>
+class Tests_Homogeneous_Uniform_Neighbor_Sampling  // value-parameterized gtest fixture
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_Homogeneous_Uniform_Neighbor_Sampling() {}
+  // Fixture hooks are empty; run_current_test() creates its own raft::handle_t.
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+  // Runs homogeneous uniform sampling from random seeds, then optionally validates the output.
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase const&,
+                                   input_usecase_t const&> const& param)
+  {
+    auto [homogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, true);  // presumably (test_weighted, renumber) - confirm
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_uniform_neighbor_sampling_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<decltype(graph_view), bool>::edge_property(handle, graph_view, 2);
+      graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    // FIXME: Read a tuple of two edge masks, mask out an edge if it is set in either one (OR),
+    // and create a new combined mask. A graph view cannot hold two masks and OR them itself, so
+    // the masks need to be OR-ed manually.
+
+    constexpr float select_probability{0.05};  // fraction of the vertices used as seeds
+
+    // FIXME:  Update the tests to initialize RngState and use it instead of seed...
+    // NOTE(review): rng_state below is already built from this seed - FIXME may be stale.
+    constexpr uint64_t seed{0};
+
+    raft::random::RngState rng_state(seed);
+    // Seeds: select_probability of all vertices, but at least 1 for a non-empty graph.
+    auto random_sources = cugraph::select_random_vertices(
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
+      false,
+      false);
+
+    //
+    //  Now we'll assign the seed vertices to batches; the labels are passed to the sampler.
+    //  NOTE(review): presumably batch_size consecutive seeds share a label - confirm sequence().
+
+    auto batch_number = std::make_optional<rmm::device_uvector<int32_t>>(0, handle.get_stream());
+
+    batch_number = cugraph::test::sequence(handle,
+                                           random_sources.size(),
+                                           homogeneous_uniform_neighbor_sampling_usecase.batch_size,
+                                           int32_t{0});
+
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle.get_stream());
+    // The sampler reads the seeds from this copy; random_sources is kept for validation.
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle.get_stream());
+
+    std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank_mapping{std::nullopt};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Uniform neighbor sampling");
+    }
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_uniform_neighbor_sample(
+        handle,
+        rng_state,
+        graph_view,
+        edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        raft::device_span<vertex_t const>{random_sources_copy.data(), random_sources.size()},
+        batch_number ? std::make_optional(raft::device_span<int32_t const>{batch_number->data(),
+                                                                           batch_number->size()})
+                     : std::nullopt,
+        label_to_output_comm_rank_mapping,
+        raft::host_span<int32_t const>(homogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_uniform_neighbor_sampling_usecase.flag_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph. Start from the sorted, unique set of sampled endpoints.
+      rmm::device_uvector<vertex_t> vertices(2 * src_out.size(), handle.get_stream());
+      raft::copy(vertices.data(), src_out.data(), src_out.size(), handle.get_stream());
+      raft::copy(
+        vertices.data() + src_out.size(), dst_out.data(), dst_out.size(), handle.get_stream());
+      vertices = cugraph::test::sort<vertex_t>(handle, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(handle, std::move(vertices));
+      // A single subgraph spanning every sampled vertex: offsets {0, vertices.size()}.
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle.get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle.get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle.get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      // The induced subgraph over those vertices is the reference edge set for the check below.
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        handle,
+        graph_view,
+        edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(
+        handle, src_compare, dst_compare, wgt_compare, src_out, dst_out, wgt_out));
+
+      if (random_sources.size() < 100) {
+        // This validation is too expensive for a large number of seed vertices
+        ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+          handle,
+          std::move(src_out),
+          std::move(dst_out),
+          std::move(wgt_out),
+          std::move(random_sources),
+          homogeneous_uniform_neighbor_sampling_usecase.fanout.size()));
+      }
+    }
+  }
+};
+
+using Tests_Homogeneous_Uniform_Neighbor_Sampling_File =  // fixture fed by File_Usecase inputs
+  Tests_Homogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat =  // fixture fed by Rmat_Usecase inputs
+  Tests_Homogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  auto const test_param = override_File_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int32_t, int32_t, float>(test_param);  // <vertex_t, edge_t, weight_t>
+}
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_File, CheckInt64Int64Float)
+{
+  auto const test_param = override_File_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int64_t, int64_t, float>(test_param);  // <vertex_t, edge_t, weight_t>
+}
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  auto const test_param = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int32_t, int32_t, float>(test_param);  // <vertex_t, edge_t, weight_t>
+}
+
+TEST_P(Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  auto const test_param = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int64_t, int64_t, float>(test_param);  // <vertex_t, edge_t, weight_t>
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,  // all four {flag_replacement, edge_masking} combinations on karate.mtx
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(  // usecase fields: {fanout, batch_size, flag_replacement, edge_masking}
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,  // the same four usecase variants, each paired with three .mtx datasets
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(  // usecase fields: {fanout, batch_size, flag_replacement, edge_masking}
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,  // generated input; Rmat args presumably start with scale 10, edge factor 16
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(  // usecase fields: {fanout, batch_size, flag_replacement, edge_masking}
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* Note: when benchmarking, scale & edge factor can be overridden by
+                          command line arguments (use --gtest_filter to select only the
+                          rmat_benchmark_test with a specific vertex & edge type combination).
+                          Do not include more than one Rmat_Usecase that differs only in scale or
+                          edge factor, to avoid running the same benchmark more than once. */
+  Tests_Homogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(  // fifth field: check_correctness = false, so validation is skipped
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, false, true, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 1024, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false, 0))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()  // NOTE(review): presumably defines main(); macro defined elsewhere
diff --git a/cpp/tests/sampling/mg_heterogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/mg_heterogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..18d8491435d
--- /dev/null
+++ b/cpp/tests/sampling/mg_heterogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout = {-1};  // fanout values handed to the sampler as a host_span
+  int32_t batch_size          = 10;    // seeds per batch when the test computes num_batches
+  int32_t num_edge_types      = 1;     // edge types are generated only when this is > 1
+  bool with_replacement       = true;  // forwarded as the last sampling_flags_t field
+  // FIXME: Edge masking is not tested yet: it would require attaching two masks
+  // (edge_type_t, bool_t) to the graph view, which is not currently supported.
+  // Once a primitive supporting heterogeneous sampling is added, maintaining
+  // two masks won't be necessary.
+  //
+  // bool edge_masking{false};
+  bool check_correctness = true;  // when set, results are gathered and validated
+};
+
+// Value-parameterized multi-GPU fixture for cugraph::heterogeneous_biased_neighbor_sample().
+// All tests in a suite share one raft::handle_t obtained from initialize_mg_handle().
+template <typename input_usecase_t>
+class Tests_MGHeterogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Builds an MG graph, samples from randomly selected seeds and, when check_correctness is
+  // set, gathers the result on rank 0 and asserts that it is a subgraph of the input graph
+  // (plus a sampling-depth check when rank 0 holds fewer than 100 seeds).
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Heterogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches =
+      (num_seeds + heterogeneous_biased_neighbor_sampling_usecase.batch_size - 1) /
+      heterogeneous_biased_neighbor_sampling_usecase.batch_size;
+
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Get unique batch_number -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+
+    // One entry per locally unique label, holding this rank's id; all-gathered below.
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // perform allgatherv
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), edge_type_t>> edge_types{
+      std::nullopt};
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(mg_graph_view), edge_type_t>::edge_property(
+        *handle_, mg_graph_view, heterogeneous_biased_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG biased_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_biased_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        *mg_edge_weight_view,  // presumably the bias view: edge weights double as biases
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_biased_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  heterogeneous_biased_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        // Assert on the validators' results; a discarded result could never fail the test.
+        ASSERT_TRUE(cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                                        mg_aggregate_src_compare,
+                                                                        mg_aggregate_dst_compare,
+                                                                        mg_aggregate_wgt_compare,
+                                                                        mg_aggregate_src,
+                                                                        mg_aggregate_dst,
+                                                                        mg_aggregate_wgt));
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            ASSERT_TRUE(cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              heterogeneous_biased_neighbor_sampling_usecase.fanout.size()));
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>  // starts out null; SetUpTestCase() assigns it
+std::unique_ptr<raft::handle_t>
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling<input_usecase_t>::handle_{nullptr};
+
+using Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File =  // fixture fed by File_Usecase
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat =  // fixture fed by Rmat_Usecase
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  auto const test_param = override_File_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int32_t, int32_t, float>(test_param);  // <vertex_t, edge_t, weight_t>
+}
+
+TEST_P(Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  auto const test_param = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int32_t, int32_t, float>(test_param);  // <vertex_t, edge_t, weight_t>
+}
+
+TEST_P(Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  auto const test_param = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());
+  run_current_test<int64_t, int64_t, float>(test_param);  // <vertex_t, edge_t, weight_t>
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,  // with and without replacement on karate.mtx, two edge types
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(  // usecase fields: {fanout, batch_size, num_edge_types, with_replacement}
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,  // the same two usecase variants, each paired with three .mtx datasets
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(  // usecase fields: {fanout, batch_size, num_edge_types, with_replacement}
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,  // generated input; see the disabled variant noted below
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(  // usecase fields: {fanout, batch_size, num_edge_types, with_replacement}
+    ::testing::Values(Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(
+      // Disabled variant: cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* Benchmark-only suite: check_correctness is false. Scale & edge factor can
+                          be overridden from the command line (combine with --gtest_filter to pick
+                          one vertex & edge type combination), so do not list Rmat_Usecases that
+                          differ only in scale or edge factor (the same benchmark would rerun). */
+  Tests_MGHeterogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false, false},
+      Heterogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()  // NOTE(review): presumably defines the MG main(); defined elsewhere
diff --git a/cpp/tests/sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..b6812b35170
--- /dev/null
+++ b/cpp/tests/sampling/mg_heterogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Heterogeneous_Uniform_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};  // fanout values handed to the sampler as a host_span
+  int32_t batch_size{10};             // seeds per batch, used to derive the number of batches
+  int32_t num_edge_types{1};          // edge types are generated only when this is > 1
+  bool with_replacement{true};        // forwarded into sampling_flags_t
+  // FIXME: Edge masking is currently not tested because it would
+  // require attaching two masks (edge_type_t, bool_t), which
+  // is not currently supported. Once a primitive supporting
+  // heterogeneous sampling is added, maintaining two masks
+  // won't be necessary.
+  // bool edge_masking{false};
+  bool check_correctness{true};  // when true, the sampled output is validated after the run
+};
+
+/**
+ * @brief Multi-GPU test fixture for cugraph::heterogeneous_uniform_neighbor_sample.
+ *
+ * Each test builds an MG graph, selects random seed vertices, assigns them to batches, runs the
+ * sampler and, when check_correctness is set, validates the gathered output on rank 0.
+ */
+template <typename input_usecase_t>
+class Tests_MGHeterogeneous_Uniform_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Runs one parameterized sampling test for the given vertex, edge and weight types.
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Heterogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    using edge_type_t = int32_t;
+
+    auto [heterogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // Create the MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      // request ~5% of the vertices, but at least one whenever the graph is non-empty
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches =
+      (num_seeds + heterogeneous_uniform_neighbor_sampling_usecase.batch_size - 1) /
+      heterogeneous_uniform_neighbor_sampling_usecase.batch_size;
+
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Distinct batch numbers present on this rank -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // Concatenate every rank's entries so each rank holds the combined vector
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+
+    // Generate the edge types
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), edge_type_t>> edge_types{
+      std::nullopt};
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types > 1) {
+      edge_types = cugraph::test::generate<decltype(mg_graph_view), edge_type_t>::edge_property(
+        *handle_, mg_graph_view, heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG heterogeneous_uniform_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::heterogeneous_uniform_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        edge_types
+          ? std::optional<cugraph::edge_property_view_t<edge_t, edge_type_t const*>>{(*edge_types)
+                                                                                       .view()}
+          : std::nullopt,
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+          heterogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+        heterogeneous_uniform_neighbor_sampling_usecase.num_edge_types,
+        cugraph::sampling_flags_t{
+          cugraph::prior_sources_behavior_t{0},
+          true,   // return_hops
+          false,  // dedupe_sources
+          heterogeneous_uniform_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (heterogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                            mg_aggregate_src_compare,
+                                                            mg_aggregate_dst_compare,
+                                                            mg_aggregate_wgt_compare,
+                                                            mg_aggregate_src,
+                                                            mg_aggregate_dst,
+                                                            mg_aggregate_wgt);
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              heterogeneous_uniform_neighbor_sampling_usecase.fanout.size());
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File =
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat =
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{  // vertex_t = int32_t, edge_t = int32_t, weight_t = float
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{  // vertex_t = int32_t, edge_t = int32_t, weight_t = float
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{  // vertex_t = int64_t, edge_t = int64_t, weight_t = float
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,  // usecase initializers: {fanout, batch_size, num_edge_types, with_replacement}
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,  // usecase initializers: {fanout, batch_size, num_edge_types, with_replacement}
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,  // usecase initializers: {fanout, batch_size, num_edge_types, with_replacement}
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false},
+                      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true}),
+    ::testing::Values(
+      // Scale 5 is used; the scale-10 variant (otherwise identical arguments) is disabled.
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* Scale & edge factor can be overridden from the command line when
+                          benchmarking (use --gtest_filter to pick one vertex/edge type combo).
+                          Do not list Rmat_Usecases differing only in scale or edge factor (the
+                          same benchmark would run twice). check_correctness is set to false. */
+  Tests_MGHeterogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, false, false},
+      Heterogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10, 7, 8}, 128, 2, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/mg_homogeneous_biased_neighbor_sampling.cpp b/cpp/tests/sampling/mg_homogeneous_biased_neighbor_sampling.cpp
new file mode 100644
index 00000000000..ce153fd3f75
--- /dev/null
+++ b/cpp/tests/sampling/mg_homogeneous_biased_neighbor_sampling.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Biased_Neighbor_Sampling_Usecase {
+  std::vector<int32_t> fanout{{-1}};  // fanout values handed to the sampler as a host_span
+  int32_t batch_size{10};             // seeds per batch, used to derive the number of batches
+  bool with_replacement{true};        // forwarded into sampling_flags_t
+
+  bool edge_masking{false};      // when true, a generated bool edge mask is attached to the graph
+  bool check_correctness{true};  // when true, the sampled output is validated after the run
+};
+
+/**
+ * @brief Multi-GPU test fixture for cugraph::homogeneous_biased_neighbor_sample.
+ *
+ * Each test builds an MG graph, selects random seed vertices, assigns them to batches, runs the
+ * sampler and, when check_correctness is set, validates the gathered output on rank 0.
+ */
+template <typename input_usecase_t>
+class Tests_MGHomogeneous_Biased_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Runs one parameterized sampling test for the given vertex, edge and weight types.
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Homogeneous_Biased_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    auto [homogeneous_biased_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // Create the MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_biased_neighbor_sampling_usecase.edge_masking) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      // request ~5% of the vertices, but at least one whenever the graph is non-empty
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds   = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches = (num_seeds + homogeneous_biased_neighbor_sampling_usecase.batch_size - 1) /
+                         homogeneous_biased_neighbor_sampling_usecase.batch_size;
+
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Distinct batch numbers present on this rank -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // Concatenate every rank's entries so each rank holds the combined vector
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG homogeneous_biased_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_biased_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        *mg_edge_weight_view,  // edge weights passed a second time (presumably as the biases)
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(homogeneous_biased_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_biased_neighbor_sampling_usecase.fanout.size()),
+
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_biased_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_biased_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                            mg_aggregate_src_compare,
+                                                            mg_aggregate_dst_compare,
+                                                            mg_aggregate_wgt_compare,
+                                                            mg_aggregate_src,
+                                                            mg_aggregate_dst,
+                                                            mg_aggregate_wgt);
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              homogeneous_biased_neighbor_sampling_usecase.fanout.size());
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t>
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGHomogeneous_Biased_Neighbor_Sampling_File =
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat =
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHomogeneous_Biased_Neighbor_Sampling_File, CheckInt32Int32Float)
+{  // vertex_t = int32_t, edge_t = int32_t, weight_t = float
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{  // vertex_t = int32_t, edge_t = int32_t, weight_t = float
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{  // vertex_t = int64_t, edge_t = int64_t, weight_t = float
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,  // usecase initializers: {fanout, batch_size, with_replacement, edge_masking}
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,  // usecase initializers: {fanout, batch_size, with_replacement, edge_masking}
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,  // usecase initializers: {fanout, batch_size, with_replacement, edge_masking}
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(
+      // Scale 5 is used; the scale-10 variant (otherwise identical arguments) is disabled.
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* Scale & edge factor can be overridden from the command line when
+                          benchmarking (use --gtest_filter to select only rmat_benchmark_test
+                          with a specific vertex & edge type combination). Do not include more
+                          than one Rmat_Usecase differing only in scale or edge factor, to avoid
+                          running the same benchmark twice. check_correctness is set to false. */
+  Tests_MGHomogeneous_Biased_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false, false},
+      Homogeneous_Biased_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sampling/mg_homogeneous_uniform_neighbor_sampling.cpp b/cpp/tests/sampling/mg_homogeneous_uniform_neighbor_sampling.cpp
new file mode 100644
index 00000000000..88f2b8e28c8
--- /dev/null
+++ b/cpp/tests/sampling/mg_homogeneous_uniform_neighbor_sampling.cpp
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "detail/nbr_sampling_validate.hpp"
+#include "utilities/base_fixture.hpp"
+#include "utilities/device_comm_wrapper.hpp"
+#include "utilities/mg_utilities.hpp"
+#include "utilities/property_generator_utilities.hpp"
+#include "utilities/test_graphs.hpp"
+
+#include <cugraph/sampling_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <gtest/gtest.h>
+
+struct Homogeneous_Uniform_Neighbor_Sampling_Usecase {  // parameters for one sampling test case
+  std::vector<int32_t> fanout{{-1}};  // fanout values handed to the sampler as a host span
+  int32_t batch_size{10};             // seeds per batch; used to derive the number of batches
+  bool with_replacement{true};        // forwarded to sampling_flags_t
+  // The two switches below steer the test harness rather than the sampler itself.
+  bool edge_masking{false};           // attach a generated edge mask to the graph view first
+  bool check_correctness{true};       // run the post-sampling validation
+};
+// Multi-GPU gtest fixture that runs homogeneous uniform neighbor sampling and checks the output.
+template <typename input_usecase_t>
+class Tests_MGHomogeneous_Uniform_Neighbor_Sampling
+  : public ::testing::TestWithParam<
+      std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling() {}
+  // handle_ is assigned in SetUpTestCase and released in TearDownTestCase (both static).
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+  // Builds the graph, picks random seeds, runs the sampler and, if requested, validates on rank 0.
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(
+    std::tuple<Homogeneous_Uniform_Neighbor_Sampling_Usecase, input_usecase_t> const& param)
+  {
+    auto [homogeneous_uniform_neighbor_sampling_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG construct graph");
+    }
+    // NOTE(review): <false, true> presumably means store_transposed/multi_gpu; confirm.
+    auto [mg_graph, mg_edge_weights, mg_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_,
+        input_usecase,
+        true /* test_weighted */,
+        true /* renumber */,
+        false /* drop_self_loops */,
+        false /* drop_multi_edges */);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+    // Optionally generate a boolean edge property and attach it to the graph view as a mask.
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (homogeneous_uniform_neighbor_sampling_usecase.edge_masking) {
+      edge_mask = cugraph::test::generate<decltype(mg_graph_view), bool>::edge_property(
+        *handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
+    //
+    // Test is designed like GNN sampling.  We'll select 5% of vertices to be included in sampling
+    // batches
+    //
+
+    constexpr float select_probability{0.05};
+    // The RNG state is seeded with this process's comm rank.
+    raft::random::RngState rng_state(handle_->get_comms().get_rank());
+
+    auto random_sources = cugraph::select_random_vertices(
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      // requested count: 5% of all vertices, but at least one when the graph is non-empty
+      std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
+               std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
+
+      false,
+      false);
+
+    //
+    //  Now we'll assign the vertices to batches
+    //
+
+    auto seed_sizes = cugraph::host_scalar_allgather(
+      handle_->get_comms(), random_sources.size(), handle_->get_stream());
+    size_t num_seeds = std::reduce(seed_sizes.begin(), seed_sizes.end());
+    size_t num_batches =
+      (num_seeds + homogeneous_uniform_neighbor_sampling_usecase.batch_size - 1) /
+      homogeneous_uniform_neighbor_sampling_usecase.batch_size;
+    // Exclusive scan of the per-rank seed counts gives each rank's global starting offset.
+    std::vector<size_t> seed_offsets(seed_sizes.size());
+    std::exclusive_scan(seed_sizes.begin(), seed_sizes.end(), seed_offsets.begin(), size_t{0});
+    // Per modulo_sequence's contract, label i is presumably (rank offset + i) % num_batches.
+    auto batch_number = cugraph::test::modulo_sequence<int32_t>(
+      *handle_, random_sources.size(), num_batches, seed_offsets[handle_->get_comms().get_rank()]);
+
+    // Get unique batch_number -> label_list
+    rmm::device_uvector<int32_t> label_list(batch_number.size(), handle_->get_stream());
+
+    raft::copy(label_list.data(), batch_number.data(), batch_number.size(), handle_->get_stream());
+
+    label_list = cugraph::test::sort<int32_t>(*handle_, std::move(label_list));
+    label_list = cugraph::test::unique<int32_t>(*handle_, std::move(label_list));
+
+    auto num_unique_labels = label_list.size();
+    // One entry per local unique label, each holding this rank; gathered across ranks below.
+    auto comm_ranks = cugraph::test::scalar_fill<int32_t>(
+      *handle_, num_unique_labels, int32_t{handle_->get_comms().get_rank()});
+
+    // perform allgatherv
+    comm_ranks = cugraph::test::device_allgatherv(*handle_, comm_ranks.data(), comm_ranks.size());
+    // NOTE(review): random_sources_copy is filled here but never read again in this function.
+    rmm::device_uvector<vertex_t> random_sources_copy(random_sources.size(), handle_->get_stream());
+
+    raft::copy(random_sources_copy.data(),
+               random_sources.data(),
+               random_sources.size(),
+               handle_->get_stream());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG uniform_neighbor_sample");
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto&& [src_out, dst_out, wgt_out, edge_id, edge_type, hop, offsets] =
+      cugraph::homogeneous_uniform_neighbor_sample(
+        *handle_,
+        rng_state,
+        mg_graph_view,
+        mg_edge_weight_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, edge_t const*>>{std::nullopt},
+        std::optional<cugraph::edge_property_view_t<edge_t, int32_t const*>>{std::nullopt},
+        raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()},
+        std::make_optional(
+          raft::device_span<int32_t const>{batch_number.data(), batch_number.size()}),
+        std::make_optional(raft::device_span<int32_t const>{comm_ranks.data(), comm_ranks.size()}),
+        raft::host_span<int32_t const>(homogeneous_uniform_neighbor_sampling_usecase.fanout.data(),
+                                       homogeneous_uniform_neighbor_sampling_usecase.fanout.size()),
+
+        cugraph::sampling_flags_t{cugraph::prior_sources_behavior_t{0},
+                                  true,   // return_hops
+                                  false,  // dedupe_sources
+                                  homogeneous_uniform_neighbor_sampling_usecase.with_replacement});
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (homogeneous_uniform_neighbor_sampling_usecase.check_correctness) {
+      // Consolidate results on GPU 0
+      auto mg_start_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{random_sources.data(), random_sources.size()});
+      auto mg_aggregate_src = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_out.data(), src_out.size()});
+      auto mg_aggregate_dst = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_out.data(), dst_out.size()});
+      auto mg_aggregate_wgt =
+        wgt_out ? std::make_optional(cugraph::test::device_gatherv(
+                    *handle_, raft::device_span<weight_t const>{wgt_out->data(), wgt_out->size()}))
+                : std::nullopt;
+
+      //  First validate that the extracted edges are actually a subset of the
+      //  edges in the input graph
+      rmm::device_uvector<vertex_t> vertices(2 * mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(
+        vertices.data(), mg_aggregate_src.data(), mg_aggregate_src.size(), handle_->get_stream());
+      raft::copy(vertices.data() + mg_aggregate_src.size(),
+                 mg_aggregate_dst.data(),
+                 mg_aggregate_dst.size(),
+                 handle_->get_stream());
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+      // Shuffle vertices by vertex partition (presumably to their owning rank), then re-dedupe.
+      vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        *handle_, std::move(vertices), mg_graph_view.vertex_partition_range_lasts());
+
+      vertices = cugraph::test::sort<vertex_t>(*handle_, std::move(vertices));
+      vertices = cugraph::test::unique<vertex_t>(*handle_, std::move(vertices));
+      // A single subgraph spanning every collected vertex: offsets {0, vertices.size()}.
+      rmm::device_uvector<size_t> d_subgraph_offsets(2, handle_->get_stream());
+      std::vector<size_t> h_subgraph_offsets({0, vertices.size()});
+
+      raft::update_device(d_subgraph_offsets.data(),
+                          h_subgraph_offsets.data(),
+                          h_subgraph_offsets.size(),
+                          handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> src_compare(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dst_compare(0, handle_->get_stream());
+      std::optional<rmm::device_uvector<weight_t>> wgt_compare{std::nullopt};
+      std::tie(src_compare, dst_compare, wgt_compare, std::ignore) = extract_induced_subgraphs(
+        *handle_,
+        mg_graph_view,
+        mg_edge_weight_view,
+        raft::device_span<size_t const>(d_subgraph_offsets.data(), 2),
+        raft::device_span<vertex_t const>(vertices.data(), vertices.size()),
+        true);
+
+      auto mg_aggregate_src_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{src_compare.data(), src_compare.size()});
+      auto mg_aggregate_dst_compare = cugraph::test::device_gatherv(
+        *handle_, raft::device_span<vertex_t const>{dst_compare.data(), dst_compare.size()});
+      auto mg_aggregate_wgt_compare =
+        wgt_compare
+          ? std::make_optional(cugraph::test::device_gatherv(
+              *handle_,
+              raft::device_span<weight_t const>{wgt_compare->data(), wgt_compare->size()}))
+          : std::nullopt;
+
+      if (handle_->get_comms().get_rank() == 0) {
+        cugraph::test::validate_extracted_graph_is_subgraph(*handle_,
+                                                            mg_aggregate_src_compare,
+                                                            mg_aggregate_dst_compare,
+                                                            mg_aggregate_wgt_compare,
+                                                            mg_aggregate_src,
+                                                            mg_aggregate_dst,
+                                                            mg_aggregate_wgt);
+
+        if (random_sources.size() < 100) {
+          // This validation is too expensive for large number of vertices
+          if (mg_aggregate_src.size() > 0) {
+            cugraph::test::validate_sampling_depth(
+              *handle_,
+              std::move(mg_aggregate_src),
+              std::move(mg_aggregate_dst),
+              std::move(mg_aggregate_wgt),
+              std::move(mg_start_src),
+              homogeneous_uniform_neighbor_sampling_usecase.fanout.size());
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>  // out-of-class definition of the fixture's static handle_
+std::unique_ptr<raft::handle_t>
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File =  // fixture taking File_Usecase inputs
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling<cugraph::test::File_Usecase>;
+
+using Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat =  // fixture taking Rmat_Usecase inputs
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File, CheckInt32Int32Float)
+{
+  auto const usecase = override_File_Usecase_with_cmd_line_arguments(GetParam());  // CLI overrides
+  run_current_test<int32_t, int32_t, float>(usecase);  // vertex_t, edge_t, weight_t
+}
+
+TEST_P(Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt32Int32Float)
+{
+  auto const usecase = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());  // CLI overrides
+  run_current_test<int32_t, int32_t, float>(usecase);  // vertex_t, edge_t, weight_t
+}
+
+TEST_P(Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat, CheckInt64Int64Float)
+{
+  auto const usecase = override_Rmat_Usecase_with_cmd_line_arguments(GetParam());  // CLI overrides
+  run_current_test<int64_t, int64_t, float>(usecase);  // vertex_t, edge_t, weight_t
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,  // usecase = {fanout, batch_size, with_replacement, edge_masking}; validation stays on
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_large_test,  // the same four usecases, combined with three other dataset files
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_File,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,  // Rmat_Usecase with first argument 5; the variant using 10 is left disabled
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false},
+                      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true}),
+    ::testing::Values(
+      // cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+      cugraph::test::Rmat_Usecase(5, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* Scale & edge factor can be overridden from the command line when
+                          benchmarking (use --gtest_filter to select only rmat_benchmark_test with
+                          a specific vertex & edge type combination). Do not include more than one
+                          Rmat_Usecase that differs only in scale or edge factor, to avoid running
+                          the same benchmark twice. Usecases below turn check_correctness off. */
+  Tests_MGHomogeneous_Uniform_Neighbor_Sampling_Rmat,
+  ::testing::Combine(
+    ::testing::Values(
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, false, true, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, false, false},
+      Homogeneous_Uniform_Neighbor_Sampling_Usecase{{4, 10}, 128, true, true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()  // NOTE(review): presumably supplies main(); macro defined elsewhere
diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu
index ef1c4f831eb..539de42845f 100644
--- a/cpp/tests/utilities/thrust_wrapper.cu
+++ b/cpp/tests/utilities/thrust_wrapper.cu
@@ -41,6 +41,7 @@ namespace test {
 
 template <typename value_t>
 cugraph::dataframe_buffer_type_t<value_t> sort(
+
   raft::handle_t const& handle, cugraph::dataframe_buffer_type_t<value_t> const& values)
 {
   auto sorted_values = cugraph::allocate_dataframe_buffer<value_t>(
@@ -403,6 +404,25 @@ template rmm::device_uvector<int64_t> sequence(raft::handle_t const& handle,
                                                size_t repeat_count,
                                                int64_t init);
 
+// Returns a buffer of `length` elements allocated on the handle's stream, each set to `value`.
+template <typename value_t>
+cugraph::dataframe_buffer_type_t<value_t> scalar_fill(raft::handle_t const& handle,
+                                                      size_t length,
+                                                      value_t value)
+{
+  auto filled = cugraph::allocate_dataframe_buffer<value_t>(length, handle.get_stream());
+
+  // Every position receives the same constant, so the index argument is left unnamed.
+  auto constant_of = [value] __device__(size_t) { return value; };
+  thrust::tabulate(handle.get_thrust_policy(), filled.begin(), filled.end(), constant_of);
+
+  return filled;
+}
+
+template rmm::device_uvector<int32_t> scalar_fill(raft::handle_t const& handle,
+                                                  size_t length,
+                                                  int32_t value);
+
 template <typename value_t>
 cugraph::dataframe_buffer_type_t<value_t> modulo_sequence(raft::handle_t const& handle,
                                                           size_t length,
diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp
index afdff33d80a..3d23d92d6cb 100644
--- a/cpp/tests/utilities/thrust_wrapper.hpp
+++ b/cpp/tests/utilities/thrust_wrapper.hpp
@@ -73,6 +73,11 @@ cugraph::dataframe_buffer_type_t<value_t> sequence(raft::handle_t const& handle,
                                                    size_t repeat_count,
                                                    value_t init);
 
+template <typename value_t>
+cugraph::dataframe_buffer_type_t<value_t> scalar_fill(raft::handle_t const& handle,
+                                                      size_t length,
+                                                      value_t value);  // `length` copies of `value`
+
 // return (init + i) % modulo, where i = [0, length)
 template <typename value_t>
 cugraph::dataframe_buffer_type_t<value_t> modulo_sequence(raft::handle_t const& handle,
diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
index 3a53c7d16c3..fb46030bc56 100644
--- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
@@ -65,6 +65,10 @@ set(cython_sources
     all_pairs_sorensen_coefficients.pyx
     all_pairs_overlap_coefficients.pyx
     all_pairs_cosine_coefficients.pyx
+    heterogeneous_biased_neighbor_sample.pyx
+    heterogeneous_uniform_neighbor_sample.pyx
+    homogeneous_biased_neighbor_sample.pyx
+    homogeneous_uniform_neighbor_sample.pyx
     edge_id_lookup_table.pyx
 )
 set(linked_libraries cugraph::cugraph;cugraph::cugraph_c)
diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py
index 9c04a528fd8..5aa351f9ce1 100644
--- a/python/pylibcugraph/pylibcugraph/__init__.py
+++ b/python/pylibcugraph/pylibcugraph/__init__.py
@@ -43,6 +43,19 @@
 
 from pylibcugraph.biased_neighbor_sample import biased_neighbor_sample
 
+from pylibcugraph.homogeneous_uniform_neighbor_sample import (
+    homogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.homogeneous_biased_neighbor_sample import (
+    homogeneous_biased_neighbor_sample,
+)
+from pylibcugraph.heterogeneous_uniform_neighbor_sample import (
+    heterogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.heterogeneous_biased_neighbor_sample import (
+    heterogeneous_biased_neighbor_sample,
+)
+
 from pylibcugraph.negative_sampling import negative_sampling
 
 from pylibcugraph.core_number import core_number
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
index 6d5d5a23cca..21f5190ad5f 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
@@ -178,6 +178,16 @@ cdef extern from "cugraph_c/algorithms.h":
             const cugraph_sample_result_t* result
         )
 
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_sample_result_get_edge_renumber_map(  # takes a sample result, returns a device array view
+            const cugraph_sample_result_t* result
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_sample_result_get_edge_renumber_map_offsets(  # takes a sample result, returns a device array view
+            const cugraph_sample_result_t* result
+        )
+
     # Deprecated, use cugraph_sample_result_get_majors
     cdef cugraph_type_erased_device_array_view_t* \
         cugraph_sample_result_get_sources(
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
index 3f7b8b9ae29..762fd37a35d 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd
@@ -67,6 +67,62 @@ cdef extern from "cugraph_c/sampling_algorithms.h":
         cugraph_error_t** error
     )
 
+    cdef cugraph_error_code_t cugraph_heterogeneous_uniform_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,  # host-side array view
+        int num_edge_types,  # not taken by the homogeneous variants
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,  # pointer-to-pointer; presumably an out-parameter
+        cugraph_error_t** error  # pointer-to-pointer; presumably an out-parameter
+    )
+
+    cdef cugraph_error_code_t cugraph_heterogeneous_biased_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_edge_property_view_t* edge_biases,  # not taken by the uniform variants
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,  # host-side array view
+        int num_edge_types,  # not taken by the homogeneous variants
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,  # pointer-to-pointer; presumably an out-parameter
+        cugraph_error_t** error  # pointer-to-pointer; presumably an out-parameter
+    )
+
+    cdef cugraph_error_code_t cugraph_homogeneous_uniform_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,  # host-side array view
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,  # pointer-to-pointer; presumably an out-parameter
+        cugraph_error_t** error  # pointer-to-pointer; presumably an out-parameter
+    )
+
+    cdef cugraph_error_code_t cugraph_homogeneous_biased_neighbor_sample(
+        const cugraph_resource_handle_t* handle,
+        cugraph_rng_state_t* rng_state,
+        cugraph_graph_t* graph,
+        const cugraph_edge_property_view_t* edge_biases,  # not taken by the uniform variants
+        const cugraph_type_erased_device_array_view_t* start_vertices,
+        const cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets,
+        const cugraph_type_erased_host_array_view_t* fan_out,  # host-side array view
+        const cugraph_sampling_options_t* options,
+        bool_t do_expensive_check,
+        cugraph_sample_result_t** result,  # pointer-to-pointer; presumably an out-parameter
+        cugraph_error_t** error  # pointer-to-pointer; presumably an out-parameter
+    )
+
     cdef cugraph_error_code_t cugraph_biased_neighbor_sample(
         const cugraph_resource_handle_t* handle,
         cugraph_graph_t* graph,
diff --git a/python/pylibcugraph/pylibcugraph/heterogeneous_biased_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/heterogeneous_biased_neighbor_sample.pyx
new file mode 100644
index 00000000000..ecdfba3afc5
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/heterogeneous_biased_neighbor_sample.pyx
@@ -0,0 +1,428 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.properties cimport (
+    cugraph_edge_property_view_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_heterogeneous_biased_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def heterogeneous_biased_neighbor_sample(ResourceHandle resource_handle,
+                                         _GPUGraph input_graph,
+                                         start_vertex_list,
+                                         starting_vertex_label_offsets,
+                                         h_fan_out,
+                                         num_edge_types,
+                                         bool_t with_replacement,
+                                         bool_t do_expensive_check,
+                                         prior_sources_behavior=None,
+                                         deduplicate_sources=False,
+                                         return_hops=False,
+                                         renumber=False,
+                                         retain_seeds=False,
+                                         compression='COO',
+                                         compress_per_hop=False,
+                                         random_state=None):
+    """
+    Performs biased neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled with biases. Heterogeneous
+    neighborhood sampling translates to more than one edge type.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    edge_biases: not supported.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. Expanding
+        'starting_vertex_label_offsets' must lead to an array of
+        len(start_vertex_list)
+
+    h_fan_out: numpy array type
+        Host array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level. The fanout value at each hop for each
+        edge type is given by the relationship
+        h_fan_out[hop*num_edge_types + edge_type_id]
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    num_edge_types: int
+        Number of edge types where a value of 1 translates to homogeneous neighbor
+        sample whereas a value greater than 1 translates to heterogeneous neighbor
+        sample.
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in addition to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors.
+        Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSC
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A dictionary of cupy array views of the sampling result, keyed by
+    'major_offsets', 'majors', 'minors', 'weight', 'edge_id', 'edge_type',
+    'batch_id' and 'label_hop_offsets'. Entries whose value is None are
+    dropped before returning, so arrays that are absent from the result
+    do not appear as keys.
+
+    If renumber was set to True, the dictionary additionally contains
+    'renumber_map', 'renumber_map_offsets' (which delineate where the
+    renumber map for each batch starts), 'edge_renumber_map' and
+    'edge_renumber_map_offsets'.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5],
+    ...                     dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4],
+    ...                     dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1,
+    ...                         1.1, 5.1, 3.1,  4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> edge_types = cupy.asarray([0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1],
+    ...                            dtype=numpy.int32)
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> num_edge_types = 2
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=False, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.heterogeneous_biased_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, num_edge_types, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 2, 5, 5, 1, 1, 1, 1], dtype=int32),
+     'minors': array([0, 1, 3, 3, 4, 0, 2, 3, 4], dtype=int32),
+     'weight': array([5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 3.1, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way we are creating pointer. Can use a single helper function to create
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_heterogeneous_biased_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        <cugraph_edge_property_view_t*>NULL, # FIXME: Add support for biased neighbor sampling
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        num_edge_types,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_heterogeneous_biased_neighbor_sample")
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the two input arrays that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+
+
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/heterogeneous_uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/heterogeneous_uniform_neighbor_sample.pyx
new file mode 100644
index 00000000000..3fa3575e27d
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/heterogeneous_uniform_neighbor_sample.pyx
@@ -0,0 +1,419 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_heterogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def heterogeneous_uniform_neighbor_sample(ResourceHandle resource_handle,
+                                          _GPUGraph input_graph,
+                                          start_vertex_list,
+                                          starting_vertex_label_offsets,
+                                          h_fan_out,
+                                          num_edge_types,
+                                          bool_t with_replacement,
+                                          bool_t do_expensive_check,
+                                          prior_sources_behavior=None,
+                                          deduplicate_sources=False,
+                                          return_hops=False,
+                                          renumber=False,
+                                          retain_seeds=False,
+                                          compression='COO',
+                                          compress_per_hop=False,
+                                          random_state=None):
+    """
+    Performs uniform neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled uniformly. Heterogeneous
+    neighborhood sampling translates to more than 1 edge type.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. Expanding
+        'starting_vertex_label_offsets' must lead to an array of
+        len(start_vertex_list)
+
+    h_fan_out: numpy array type
+        Host array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level. The fanout value at each hop for each
+        edge type is given by the relationship
+        h_fan_out[x*num_edge_types + edge_type_id]
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    num_edge_types: int
+        Number of edge types where a value of 1 translates to homogeneous neighbor
+        sample whereas a value greater than 1 translates to heterogeneous neighbor
+        sample.
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in addition to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors.
+        Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSC
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A dictionary of cupy array views of the sampling result, keyed by
+    'major_offsets', 'majors', 'minors', 'weight', 'edge_id', 'edge_type',
+    'batch_id' and 'label_hop_offsets'. Entries whose value is None are
+    dropped before returning, so arrays that are absent from the result
+    do not appear as keys.
+
+    If renumber was set to True, the dictionary additionally contains
+    'renumber_map', 'renumber_map_offsets' (which delineate where the
+    renumber map for each batch starts), 'edge_renumber_map' and
+    'edge_renumber_map_offsets'.
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5],
+    ...                     dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4],
+    ...                     dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1,
+    ...                         1.1, 5.1, 3.1,  4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> edge_types = cupy.asarray([0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1],
+    ...                            dtype=numpy.int32)
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> num_edge_types = 2
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=False, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.heterogeneous_uniform_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, num_edge_types, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 2, 5, 5, 1, 1, 1, 1], dtype=int32),
+     'minors': array([0, 1, 3, 3, 4, 0, 2, 3, 4], dtype=int32),
+     'weight': array([5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 3.1, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way we are creating pointer. Can use a single helper function to create
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T  # NOTE(review): view type is fixed to SIZE_T, not derived from the array's dtype - confirm callers pass a matching dtype
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL  # NOTE(review): not referenced again in this function
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_heterogeneous_uniform_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        num_edge_types,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_heterogeneous_uniform_neighbor_sample")
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the input array views that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/homogeneous_biased_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/homogeneous_biased_neighbor_sample.pyx
new file mode 100644
index 00000000000..e2476de1607
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/homogeneous_biased_neighbor_sample.pyx
@@ -0,0 +1,418 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.properties cimport (
+    cugraph_edge_property_view_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_homogeneous_biased_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def homogeneous_biased_neighbor_sample(ResourceHandle resource_handle,
+                                       _GPUGraph input_graph,
+                                       start_vertex_list,
+                                       starting_vertex_label_offsets,
+                                       h_fan_out,
+                                       bool_t with_replacement,
+                                       bool_t do_expensive_check,
+                                       prior_sources_behavior=None,
+                                       deduplicate_sources=False,
+                                       return_hops=False,
+                                       renumber=False,
+                                       retain_seeds=False,
+                                       compression='COO',
+                                       compress_per_hop=False,
+                                       random_state=None):
+    """
+    Performs biased neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled with biases. Homogeneous
+    neighborhood sampling translates to 1 edge type.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    edge_biases: not a parameter yet; NULL is passed to the C API for it.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. When provided,
+        the last offset must equal len(start_vertex_list), otherwise a
+        ValueError is raised.
+
+    h_fan_out: numpy array type
+        Host array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in addition to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors. Requires
+        'starting_vertex_label_offsets'. Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSC
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A dictionary of device arrays, keyed by name. The candidate keys are
+    'major_offsets', 'majors', 'minors', 'weight', 'edge_id', 'edge_type',
+    'batch_id' and 'label_hop_offsets'. Any entry whose value is None
+    (i.e. the corresponding result getter returned None) is dropped from
+    the dictionary before it is returned.
+
+    If renumber was set to True, the candidate keys additionally include
+    'renumber_map', 'renumber_map_offsets', 'edge_renumber_map' and
+    'edge_renumber_map_offsets' (the offsets delineate where the map for
+    each batch starts).
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4], dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1, 1.1, 5.1, 3.1,
+    ...                         4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> start_vertices = cupy.asarray([2, 5]).astype(numpy.int32)
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.homogeneous_biased_neighbor_sample(
+    ...         resource_handle, G, start_vertices, None, h_fan_out, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 5, 5], dtype=int32),
+     'minors': array([1, 3, 3, 4], dtype=int32),
+     'weight': array([3.1, 4.1, 7.2, 3.2], dtype=float32)}
+
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> sampling_results = pylibcugraph.homogeneous_biased_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 5, 5, 1, 1], dtype=int32),
+     'minors': array([1, 3, 3, 4, 3, 4], dtype=int32),
+     'weight': array([3.1, 4.1, 7.2, 3.2, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way the array-view pointers are created; a single helper function could do it
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T  # NOTE(review): dtype is hard-coded here, not read from the array - confirm callers pass size_t-compatible offsets
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL  # NOTE(review): never referenced again in this function
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_homogeneous_biased_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        <cugraph_edge_property_view_t*>NULL, # FIXME: Add support for biased neighbor sampling
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_homogeneous_biased_neighbor_sample")  # NOTE(review): if this raises, the options and views created above are not freed - confirm
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the input array views that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,  # always removed by the None filter in the return below
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/homogeneous_uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/homogeneous_uniform_neighbor_sample.pyx
new file mode 100644
index 00000000000..3c6cdf77420
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/homogeneous_uniform_neighbor_sample.pyx
@@ -0,0 +1,413 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from pylibcugraph._cugraph_c.types cimport (
+    bool_t,
+    SIZE_T
+)
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_create,
+    cugraph_type_erased_device_array_view_free,
+    cugraph_type_erased_host_array_view_t,
+    cugraph_type_erased_host_array_view_create,
+    cugraph_type_erased_host_array_view_free,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.algorithms cimport (
+    cugraph_sample_result_t,
+    cugraph_prior_sources_behavior_t,
+    cugraph_compression_type_t,
+    cugraph_sampling_options_t,
+    cugraph_sampling_options_create,
+    cugraph_sampling_options_free,
+    cugraph_sampling_set_with_replacement,
+    cugraph_sampling_set_return_hops,
+    cugraph_sampling_set_prior_sources_behavior,
+    cugraph_sampling_set_dedupe_sources,
+    cugraph_sampling_set_renumber_results,
+    cugraph_sampling_set_compress_per_hop,
+    cugraph_sampling_set_compression_type,
+    cugraph_sampling_set_retain_seeds,
+)
+from pylibcugraph._cugraph_c.sampling_algorithms cimport (
+    cugraph_homogeneous_uniform_neighbor_sample,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    assert_CAI_type,
+    assert_AI_type,
+    get_c_type_from_numpy_type,
+)
+from pylibcugraph.internal_types.sampling_result cimport (
+    SamplingResult,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+import warnings
+
+# TODO accept cupy/numpy random state in addition to raw seed.
+def homogeneous_uniform_neighbor_sample(ResourceHandle resource_handle,
+                                          _GPUGraph input_graph,
+                                          start_vertex_list,
+                                          starting_vertex_label_offsets,
+                                          h_fan_out,
+                                          bool_t with_replacement,
+                                          bool_t do_expensive_check,
+                                          prior_sources_behavior=None,
+                                          deduplicate_sources=False,
+                                          return_hops=False,
+                                          renumber=False,
+                                          retain_seeds=False,
+                                          compression='COO',
+                                          compress_per_hop=False,
+                                          random_state=None):
+    """
+    Performs uniform neighborhood sampling, which samples nodes from
+    a graph based on the current node's neighbors, with a corresponding fan_out
+    value at each hop. The edges are sampled uniformly. Homogeneous
+    neighborhood sampling translates to 1 edge type.
+
+    Parameters
+    ----------
+    resource_handle: ResourceHandle
+        Handle to the underlying device and host resources needed for
+        referencing data and running algorithms.
+
+    input_graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    start_vertex_list: device array type
+        Device array containing the list of starting vertices for sampling.
+
+    starting_vertex_label_offsets: device array type (Optional)
+        Offsets of each label within the start vertex list. When provided,
+        the last offset must equal len(start_vertex_list), otherwise a
+        ValueError is raised.
+
+    h_fan_out: numpy array type
+        Host array containing the branching out (fan-out) degrees per
+        starting vertex for each hop level
+
+        The sampling method can use different fan_out values for each edge type
+        which is not the case for homogeneous neighborhood sampling (both biased
+        and uniform).
+
+    with_replacement: bool
+        If true, sampling procedure is done with replacement (the same vertex
+        can be selected multiple times in the same step).
+
+    do_expensive_check: bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    prior_sources_behavior: str (Optional)
+        Options are "carryover", and "exclude".
+        Default will leave the source list as-is.
+        Carryover will carry over sources from previous hops to the
+        current hop.
+        Exclude will exclude sources from previous hops from reappearing
+        as sources in future hops.
+
+    deduplicate_sources: bool (Optional)
+        If True, will deduplicate the source list before sampling.
+        Defaults to False.
+
+    renumber: bool (Optional)
+        If True, will renumber the sources and destinations on a
+        per-batch basis and return the renumber map and batch offsets
+        in addition to the standard returns.
+
+    retain_seeds: bool (Optional)
+        If True, will retain the original seeds (original source vertices)
+        in the output even if they do not have outgoing neighbors. Requires
+        'starting_vertex_label_offsets'. Defaults to False.
+
+    compression: str (Optional)
+        Options: COO (default), CSR, CSC, DCSR, DCSC
+        Sets the compression format for the returned samples.
+
+    compress_per_hop: bool (Optional)
+        If False (default), will create a compressed edgelist for the
+        entire batch.
+        If True, will create a separate compressed edgelist per hop within
+        a batch.
+
+    random_state: int (Optional)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    Returns
+    -------
+    A dictionary of device arrays, keyed by name. The candidate keys are
+    'major_offsets', 'majors', 'minors', 'weight', 'edge_id', 'edge_type',
+    'batch_id' and 'label_hop_offsets'. Any entry whose value is None
+    (i.e. the corresponding result getter returned None) is dropped from
+    the dictionary before it is returned.
+
+    If renumber was set to True, the candidate keys additionally include
+    'renumber_map', 'renumber_map_offsets', 'edge_renumber_map' and
+    'edge_renumber_map_offsets' (the offsets delineate where the map for
+    each batch starts).
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4], dtype=numpy.int32)
+    >>> weights = cupy.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1, 1.1, 5.1, 3.1,
+    ...                         4.1, 7.2, 3.2], dtype=numpy.float32)
+    >>> start_vertices = cupy.asarray([2, 5]).astype(numpy.int32)
+    >>> h_fan_out = numpy.array([2]).astype(numpy.int32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=False, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weight_array=weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> sampling_results = pylibcugraph.homogeneous_uniform_neighbor_sample(
+    ...         resource_handle, G, start_vertices, None, h_fan_out, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 5, 5], dtype=int32),
+     'minors': array([1, 3, 3, 4], dtype=int32),
+     'weight': array([3.1, 4.1, 7.2, 3.2], dtype=float32)}
+
+    >>> start_vertices = cupy.asarray([2, 5, 1]).astype(numpy.int32)
+    >>> starting_vertex_label_offsets = cupy.asarray([0, 2, 3])
+    >>> sampling_results = pylibcugraph.homogeneous_uniform_neighbor_sample(
+    ...         resource_handle, G, start_vertices, starting_vertex_label_offsets,
+    ...         h_fan_out, False, True)
+    >>> sampling_results
+    {'majors': array([2, 2, 5, 5, 1, 1], dtype=int32),
+     'minors': array([1, 3, 3, 4, 3, 4], dtype=int32),
+     'weight': array([3.1, 4.1, 7.2, 3.2, 2.1, 1.1], dtype=float32)}
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = (
+        resource_handle.c_resource_handle_ptr
+    )
+
+    cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr
+    cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = <cugraph_type_erased_host_array_view_t*>NULL
+
+    cdef bool_t c_deduplicate_sources = deduplicate_sources
+    cdef bool_t c_return_hops = return_hops
+    cdef bool_t c_renumber = renumber
+    cdef bool_t c_compress_per_hop = compress_per_hop
+
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+    cdef uintptr_t ai_fan_out_ptr
+
+    # FIXME: refactor the way the array-view pointers are created; a single helper function could do it
+
+    assert_CAI_type(start_vertex_list, "start_vertex_list")
+    assert_CAI_type(starting_vertex_label_offsets, "starting_vertex_label_offsets", True)
+
+    assert_AI_type(h_fan_out, "h_fan_out")
+
+    if starting_vertex_label_offsets is not None:
+        if starting_vertex_label_offsets[-1] != len(start_vertex_list):
+            raise ValueError(
+                "'starting_vertex_label_offsets' and 'start_vertex_list' must be proportional")
+
+    ai_fan_out_ptr = \
+        h_fan_out.__array_interface__["data"][0]
+
+    fan_out_ptr = \
+        cugraph_type_erased_host_array_view_create(
+            <void*>ai_fan_out_ptr,
+            len(h_fan_out),
+            get_c_type_from_numpy_type(h_fan_out.dtype))
+
+
+
+    cdef cugraph_sample_result_t* result_ptr
+
+    cdef uintptr_t cai_start_ptr = \
+        start_vertex_list.__cuda_array_interface__["data"][0]
+
+    cdef uintptr_t cai_starting_vertex_label_offsets_ptr
+    if starting_vertex_label_offsets is not None:
+        cai_starting_vertex_label_offsets_ptr = \
+            starting_vertex_label_offsets.__cuda_array_interface__['data'][0]
+
+
+    cdef cugraph_type_erased_device_array_view_t* start_vertex_list_ptr = \
+        cugraph_type_erased_device_array_view_create(
+            <void*>cai_start_ptr,
+            len(start_vertex_list),
+            get_c_type_from_numpy_type(start_vertex_list.dtype))
+
+
+    cdef cugraph_type_erased_device_array_view_t* starting_vertex_label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL
+    if starting_vertex_label_offsets is not None:
+        starting_vertex_label_offsets_ptr = \
+            cugraph_type_erased_device_array_view_create(
+                <void*>cai_starting_vertex_label_offsets_ptr,
+                len(starting_vertex_label_offsets),
+                SIZE_T  # NOTE(review): dtype is hard-coded here, not read from the array - confirm callers pass size_t-compatible offsets
+            )
+
+    cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = <cugraph_type_erased_device_array_view_t*>NULL  # NOTE(review): never referenced again in this function
+    if retain_seeds:
+        if starting_vertex_label_offsets is None:
+            raise ValueError("Must provide label offsets if retain_seeds is True")
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = \
+        cg_rng_state.rng_state_ptr
+
+    cdef cugraph_prior_sources_behavior_t prior_sources_behavior_e
+    if prior_sources_behavior is None:
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.DEFAULT
+    elif prior_sources_behavior == 'carryover':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.CARRY_OVER
+    elif prior_sources_behavior == 'exclude':
+        prior_sources_behavior_e = cugraph_prior_sources_behavior_t.EXCLUDE
+    else:
+        raise ValueError(
+            f'Invalid option {prior_sources_behavior}'
+            ' for prior sources behavior'
+        )
+
+    cdef cugraph_compression_type_t compression_behavior_e
+    if compression is None or compression == 'COO':
+        compression_behavior_e = cugraph_compression_type_t.COO
+    elif compression == 'CSR':
+        compression_behavior_e = cugraph_compression_type_t.CSR
+    elif compression == 'CSC':
+        compression_behavior_e = cugraph_compression_type_t.CSC
+    elif compression == 'DCSR':
+        compression_behavior_e = cugraph_compression_type_t.DCSR
+    elif compression == 'DCSC':
+        compression_behavior_e = cugraph_compression_type_t.DCSC
+    else:
+        raise ValueError(
+            f'Invalid option {compression}'
+            ' for compression type'
+        )
+
+    cdef cugraph_sampling_options_t* sampling_options
+    error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_sampling_options_create")
+
+    cugraph_sampling_set_with_replacement(sampling_options, with_replacement)
+    cugraph_sampling_set_return_hops(sampling_options, c_return_hops)
+    cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources)
+    cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e)
+    cugraph_sampling_set_renumber_results(sampling_options, c_renumber)
+    cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e)
+    cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop)
+    cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds)
+
+    error_code = cugraph_homogeneous_uniform_neighbor_sample(
+        c_resource_handle_ptr,
+        rng_state_ptr,
+        c_graph_ptr,
+        start_vertex_list_ptr,
+        starting_vertex_label_offsets_ptr,
+        fan_out_ptr,
+        sampling_options,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_homogeneous_uniform_neighbor_sample")  # NOTE(review): if this raises, the options and views created above are not freed - confirm
+
+    # Free the sampling options
+    cugraph_sampling_options_free(sampling_options)
+
+    # Free the input array views that are no longer needed.
+    cugraph_type_erased_device_array_view_free(start_vertex_list_ptr)
+    cugraph_type_erased_host_array_view_free(fan_out_ptr)
+
+    if starting_vertex_label_offsets is not None:
+        cugraph_type_erased_device_array_view_free(starting_vertex_label_offsets_ptr)
+
+    # Have the SamplingResult instance assume ownership of the result data.
+    result = SamplingResult()
+    result.set_ptr(result_ptr)
+
+    # Get cupy "views" of the individual arrays to return. These each increment
+    # the refcount on the SamplingResult instance which will keep the data alive
+    # until all references are removed and the GC runs.
+    cupy_majors = result.get_majors()
+    cupy_major_offsets = result.get_major_offsets()
+    cupy_minors = result.get_minors()
+    cupy_edge_weights = result.get_edge_weights()
+    cupy_edge_ids = result.get_edge_ids()
+    cupy_edge_types = result.get_edge_types()
+    cupy_batch_ids = result.get_batch_ids()
+    cupy_label_hop_offsets = result.get_label_hop_offsets()
+
+    if renumber:
+        cupy_renumber_map = result.get_renumber_map()
+        cupy_renumber_map_offsets = result.get_renumber_map_offsets()
+        cupy_edge_renumber_map = result.get_edge_renumber_map()
+        cupy_edge_renumber_map_offsets = result.get_edge_renumber_map_offsets()
+
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+            'hop_id': None,  # always removed by the None filter in the return below
+            'renumber_map': cupy_renumber_map,
+            'renumber_map_offsets': cupy_renumber_map_offsets,
+            'edge_renumber_map' : cupy_edge_renumber_map,
+            'edge_renumber_map_offsets' : cupy_edge_renumber_map_offsets
+        }
+
+    else:
+        sampling_results = {
+            'major_offsets': cupy_major_offsets,
+            'majors': cupy_majors,
+            'minors': cupy_minors,
+            'weight': cupy_edge_weights,
+            'edge_id': cupy_edge_ids,
+            'edge_type': cupy_edge_types,
+            'batch_id': cupy_batch_ids,
+            'label_hop_offsets': cupy_label_hop_offsets,
+        }
+
+    # Return everything that isn't null
+    return {k: v for k, v in sampling_results.items() if v is not None}
diff --git a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
index f588237942b..b93618d73ce 100644
--- a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
+++ b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
@@ -34,6 +34,8 @@ from pylibcugraph._cugraph_c.algorithms cimport (
     cugraph_sample_result_get_offsets, # deprecated
     cugraph_sample_result_get_renumber_map,
     cugraph_sample_result_get_renumber_map_offsets,
+    cugraph_sample_result_get_edge_renumber_map,
+    cugraph_sample_result_get_edge_renumber_map_offsets,
     cugraph_sample_result_free,
 )
 from pylibcugraph.utils cimport (
@@ -257,3 +259,30 @@ cdef class SamplingResult:
 
         return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
                                                      self)
+
+
+    def get_edge_renumber_map(self):  # Returns an array view of the result's edge renumber map, or None if the C getter yields NULL.
+        if self.c_sample_result_ptr is NULL:  # no result attached yet; refuse to query the C API
+            raise ValueError("pointer not set, must call set_ptr() with a "
+                             "non-NULL value first.")
+        cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
+            cugraph_sample_result_get_edge_renumber_map(self.c_sample_result_ptr)
+        )
+        if device_array_view_ptr is NULL:  # the result carries no edge renumber map
+            return None
+
+        return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
+                                                     self)  # self is passed along, presumably as the owner that keeps the data alive - confirm
+
+    def get_edge_renumber_map_offsets(self):  # Returns an array view of the edge renumber map offsets, or None if the C getter yields NULL.
+        if self.c_sample_result_ptr is NULL:  # no result attached yet; refuse to query the C API
+            raise ValueError("pointer not set, must call set_ptr() with a "
+                             "non-NULL value first.")
+        cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
+            cugraph_sample_result_get_edge_renumber_map_offsets(self.c_sample_result_ptr)
+        )
+        if device_array_view_ptr is NULL:  # the result carries no edge renumber map offsets
+            return None
+
+        return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
+                                                     self)  # self is passed along, presumably as the owner that keeps the data alive - confirm