diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index e0e728640ca..69f2ad20de8 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include <cub/cub.cuh>
 #include
 #include
 #include
@@ -73,6 +74,14 @@ struct has_nzd_t {
   }
 };
 
+// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an
+// extended __device__ lambda must allow its address to be taken)
+template <typename edge_t>
+struct rebase_offset_t {
+  edge_t base_offset{};
+  __device__ edge_t operator()(edge_t offset) const { return offset - base_offset; }
+};
+
 // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid
 template <bool store_transposed, typename vertex_t, typename edge_t, typename weight_t>
 std::tuple<rmm::device_uvector<edge_t>,
@@ -202,6 +211,145 @@ compress_edgelist(edgelist_t<vertex_t, edge_t, weight_t> const& edgelist,
     std::move(offsets), std::move(indices), std::move(weights), std::move(dcs_nzd_vertices));
 }
 
+template <typename vertex_t, typename edge_t, typename weight_t>
+void sort_adjacency_list(raft::handle_t const& handle,
+                         edge_t const* offsets,
+                         vertex_t* indices /* [INOUT] */,
+                         std::optional<weight_t*> weights /* [INOUT] */,
+                         vertex_t num_vertices,
+                         edge_t num_edges)
+{
+  // FIXME: The current cub segmented sort based implementation is slower than the global sort
+  // based approach, but we expect cub's segmented sort performance to get significantly better in
+  // a few months. We also need to re-evaluate the performance & memory overhead of presorting the
+  // edge list and running thrust::reduce to update offsets vs. the current approach after updating
+  // the python interface. If we take r-values of the rmm::device_uvector edge list, we can do
+  // indices_ = std::move(minors) & weights_ = std::move(weights). This affects peak memory use,
+  // and we may find the presorting approach more attractive under this scenario.
+
+  // 1. We segment-sort edges in chunks, and we need to adjust chunk offsets to vertex boundaries
+  // as each vertex's neighbors should be sorted at once.
+
+  // to limit memory footprint ((1 << 20) is a tuning parameter)
+  auto approx_edges_to_sort_per_iteration =
+    static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 20);
+  auto search_offset_first =
+    thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
+                                    [approx_edges_to_sort_per_iteration] __device__(auto i) {
+                                      return i * approx_edges_to_sort_per_iteration;
+                                    });
+  auto num_chunks =
+    (num_edges + approx_edges_to_sort_per_iteration - 1) / approx_edges_to_sort_per_iteration;
+  rmm::device_uvector<vertex_t> d_vertex_offsets(num_chunks - 1, handle.get_stream());
+  thrust::lower_bound(handle.get_thrust_policy(),
+                      offsets,
+                      offsets + num_vertices + 1,
+                      search_offset_first,
+                      search_offset_first + d_vertex_offsets.size(),
+                      d_vertex_offsets.begin());
+  rmm::device_uvector<edge_t> d_edge_offsets(d_vertex_offsets.size(), handle.get_stream());
+  thrust::gather(handle.get_thrust_policy(),
+                 d_vertex_offsets.begin(),
+                 d_vertex_offsets.end(),
+                 offsets,
+                 d_edge_offsets.begin());
+  std::vector<edge_t> h_edge_offsets(num_chunks + 1, edge_t{0});
+  h_edge_offsets.back() = num_edges;
+  raft::update_host(
+    h_edge_offsets.data() + 1, d_edge_offsets.data(), d_edge_offsets.size(), handle.get_stream());
+  std::vector<vertex_t> h_vertex_offsets(num_chunks + 1, vertex_t{0});
+  h_vertex_offsets.back() = num_vertices;
+  raft::update_host(h_vertex_offsets.data() + 1,
+                    d_vertex_offsets.data(),
+                    d_vertex_offsets.size(),
+                    handle.get_stream());
+
+  // 2. Segmented sort each vertex's neighbors
+
+  size_t max_chunk_size{0};
+  for (size_t i = 0; i < num_chunks; ++i) {
+    max_chunk_size =
+      std::max(max_chunk_size, static_cast<size_t>(h_edge_offsets[i + 1] - h_edge_offsets[i]));
+  }
+  rmm::device_uvector<vertex_t> segment_sorted_indices(max_chunk_size, handle.get_stream());
+  auto segment_sorted_weights =
+    weights ? std::make_optional<rmm::device_uvector<weight_t>>(max_chunk_size, handle.get_stream())
+            : std::nullopt;
+  rmm::device_uvector<std::byte> d_temp_storage(0, handle.get_stream());
+  for (size_t i = 0; i < num_chunks; ++i) {
+    size_t temp_storage_bytes{0};
+    auto offset_first = thrust::make_transform_iterator(offsets + h_vertex_offsets[i],
+                                                        rebase_offset_t<edge_t>{h_edge_offsets[i]});
+    if (weights) {
+      cub::DeviceSegmentedRadixSort::SortPairs(static_cast<void*>(nullptr),
+                                               temp_storage_bytes,
+                                               indices + h_edge_offsets[i],
+                                               segment_sorted_indices.data(),
+                                               (*weights) + h_edge_offsets[i],
+                                               (*segment_sorted_weights).data(),
+                                               h_edge_offsets[i + 1] - h_edge_offsets[i],
+                                               h_vertex_offsets[i + 1] - h_vertex_offsets[i],
+                                               offset_first,
+                                               offset_first + 1,
+                                               0,
+                                               sizeof(vertex_t) * 8,
+                                               handle.get_stream());
+    } else {
+      cub::DeviceSegmentedRadixSort::SortKeys(static_cast<void*>(nullptr),
+                                              temp_storage_bytes,
+                                              indices + h_edge_offsets[i],
+                                              segment_sorted_indices.data(),
+                                              h_edge_offsets[i + 1] - h_edge_offsets[i],
+                                              h_vertex_offsets[i + 1] - h_vertex_offsets[i],
+                                              offset_first,
+                                              offset_first + 1,
+                                              0,
+                                              sizeof(vertex_t) * 8,
+                                              handle.get_stream());
+    }
+    if (temp_storage_bytes > d_temp_storage.size()) {
+      d_temp_storage = rmm::device_uvector<std::byte>(temp_storage_bytes, handle.get_stream());
+    }
+    if (weights) {
+      cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(),
+                                               temp_storage_bytes,
+                                               indices + h_edge_offsets[i],
+                                               segment_sorted_indices.data(),
+                                               (*weights) + h_edge_offsets[i],
+                                               (*segment_sorted_weights).data(),
+                                               h_edge_offsets[i + 1] - h_edge_offsets[i],
+                                               h_vertex_offsets[i + 1] - h_vertex_offsets[i],
+                                               offset_first,
+                                               offset_first + 1,
+                                               0,
+                                               sizeof(vertex_t) * 8,
+                                               handle.get_stream());
+    } else {
+      cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage.data(),
+                                              temp_storage_bytes,
+                                              indices + h_edge_offsets[i],
+                                              segment_sorted_indices.data(),
+                                              h_edge_offsets[i + 1] - h_edge_offsets[i],
+                                              h_vertex_offsets[i + 1] - h_vertex_offsets[i],
+                                              offset_first,
+                                              offset_first + 1,
+                                              0,
+                                              sizeof(vertex_t) * 8,
+                                              handle.get_stream());
+    }
+    thrust::copy(handle.get_thrust_policy(),
+                 segment_sorted_indices.begin(),
+                 segment_sorted_indices.begin() + (h_edge_offsets[i + 1] - h_edge_offsets[i]),
+                 indices + h_edge_offsets[i]);
+    if (weights) {
+      thrust::copy(handle.get_thrust_policy(),
+                   (*segment_sorted_weights).begin(),
+                   (*segment_sorted_weights).begin() + (h_edge_offsets[i + 1] - h_edge_offsets[i]),
+                   (*weights) + h_edge_offsets[i]);
+    }
+  }
+}
+
 } // namespace
 
 template <typename vertex_t,
@@ ... @@
+  // segmented sort neighbors
+  for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) {
+    sort_adjacency_list(handle,
+                        adj_matrix_partition_offsets_[i].data(),
+                        adj_matrix_partition_indices_[i].data(),
+                        adj_matrix_partition_weights_
+                          ? std::optional<weight_t*>{(*adj_matrix_partition_weights_)[i].data()}
+                          : std::nullopt,
+                        static_cast<vertex_t>(adj_matrix_partition_offsets_[i].size() - 1),
+                        static_cast<edge_t>(adj_matrix_partition_indices_[i].size()));
+  }
+
   // if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to
   // support storing edge row/column properties in (key, value) pairs.
 
@@ -645,6 +806,15 @@ graph_t<vertex_t,
     this->get_number_of_vertices(), default_stream_view);
 
+  // segmented sort neighbors
+
+  sort_adjacency_list(handle,
+                      offsets_.data(),
+                      indices_.data(),
+                      weights_ ? std::optional<weight_t*>{(*weights_).data()} : std::nullopt,
+                      static_cast<vertex_t>(offsets_.size() - 1),
+                      static_cast<edge_t>(indices_.size()));
+
   // optional expensive checks (part 3/3)
 
   if (do_expensive_check) {
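
Note on step 1 of sort_adjacency_list: the tentative chunk boundaries (multiples of
approx_edges_to_sort_per_iteration) are rounded up to vertex boundaries so that no vertex's
neighbor list straddles two chunks. A minimal, self-contained sketch of that rounding follows;
it is not part of the patch, and the toy values and the approx_edges_per_chunk name are
illustrative only:

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>

#include <vector>

int main()
{
  // CSR offsets for 4 vertices with degrees 2, 3, 1, 4 (10 edges total)
  thrust::device_vector<int> offsets(std::vector<int>{0, 2, 5, 6, 10});

  // target edges per chunk; the diff derives this from multiProcessorCount * (1 << 20)
  int const approx_edges_per_chunk = 4;
  int const num_edges              = 10;
  int const num_chunks = (num_edges + approx_edges_per_chunk - 1) / approx_edges_per_chunk;  // 3

  // tentative chunk boundaries in edge positions: 4, 8
  thrust::device_vector<int> search_offsets(num_chunks - 1);
  thrust::sequence(
    search_offsets.begin(), search_offsets.end(), approx_edges_per_chunk, approx_edges_per_chunk);

  // for each boundary, find the first vertex whose offset is >= the boundary; this rounds the
  // boundary up to a vertex boundary so every neighbor list is sorted within a single chunk
  thrust::device_vector<int> vertex_boundaries(num_chunks - 1);
  thrust::lower_bound(thrust::device,
                      offsets.begin(),
                      offsets.end(),
                      search_offsets.begin(),
                      search_offsets.end(),
                      vertex_boundaries.begin());

  // vertex_boundaries = {2, 4}: chunk 0 sorts vertices [0, 2) (edges [0, 5)),
  // chunk 1 sorts vertices [2, 4) (edges [5, 10)), and chunk 2 ends up empty
  return 0;
}

This is the same search that search_offset_first plus thrust::lower_bound perform in the patch,
with the vertex boundaries then gathered back into edge offsets via thrust::gather.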
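Step 2 relies on CUB's two-pass protocol: a first call with a null temporary-storage pointer only
computes temp_storage_bytes, and a second call does the actual sort (the patch additionally reuses
d_temp_storage across chunks, growing it only when a chunk needs more scratch space). A
self-contained sketch of that protocol, again with illustrative toy data rather than the patch's
types:

#include <cub/cub.cuh>

#include <thrust/device_vector.h>

#include <vector>

int main()
{
  // two "neighbor lists" (segments): elements [0, 3) and [3, 6)
  thrust::device_vector<int> keys_in(std::vector<int>{5, 1, 4, 9, 2, 7});
  thrust::device_vector<int> keys_out(6);
  thrust::device_vector<int> seg_offsets(std::vector<int>{0, 3, 6});

  // pass 1: with a null temporary-storage pointer, cub only reports the scratch size needed
  size_t temp_storage_bytes{0};
  cub::DeviceSegmentedRadixSort::SortKeys(static_cast<void*>(nullptr),
                                          temp_storage_bytes,
                                          thrust::raw_pointer_cast(keys_in.data()),
                                          thrust::raw_pointer_cast(keys_out.data()),
                                          6 /* num_items */,
                                          2 /* num_segments */,
                                          thrust::raw_pointer_cast(seg_offsets.data()),
                                          thrust::raw_pointer_cast(seg_offsets.data()) + 1,
                                          0,
                                          sizeof(int) * 8);

  // pass 2: allocate the scratch space and sort each segment independently
  thrust::device_vector<unsigned char> temp_storage(temp_storage_bytes);
  cub::DeviceSegmentedRadixSort::SortKeys(thrust::raw_pointer_cast(temp_storage.data()),
                                          temp_storage_bytes,
                                          thrust::raw_pointer_cast(keys_in.data()),
                                          thrust::raw_pointer_cast(keys_out.data()),
                                          6,
                                          2,
                                          thrust::raw_pointer_cast(seg_offsets.data()),
                                          thrust::raw_pointer_cast(seg_offsets.data()) + 1,
                                          0,
                                          sizeof(int) * 8);

  // keys_out now holds {1, 4, 5, 2, 7, 9}: each segment is sorted, segments stay in place
  return 0;
}

In the patch the begin/end offset iterators are the rebased CSR offsets (offset_first,
offset_first + 1), so each vertex's neighbor list within the chunk is one segment.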