Sort local neighbors in the graph adjacency list. #1886

Merged: 3 commits, Oct 18, 2021
170 changes: 170 additions & 0 deletions cpp/src/structure/graph_impl.cuh
@@ -30,6 +30,7 @@
#include <thrust/binary_search.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
@@ -73,6 +74,14 @@ struct has_nzd_t {
}
};

// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an
// extended __device__ lambda must allow its address to be taken)
template <typename edge_t>
struct rebase_offset_t {
edge_t base_offset{};
__device__ edge_t operator()(edge_t offset) const { return offset - base_offset; }
};
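// Illustrative sketch (editorial note, not part of this diff): how a functor like rebase_offset_t
// is consumed through thrust::make_transform_iterator to produce chunk-local, 0-based offsets on
// the fly, which is how it is used further down. The offset values and the name rebase_example
// are made up for illustration.
#include <thrust/device_vector.h>
#include <thrust/iterator/transform_iterator.h>

#include <vector>

void rebase_example()
{
  // Global CSR offsets of a chunk whose first edge index is 10.
  std::vector<int> h_offsets{10, 12, 15, 19};
  thrust::device_vector<int> offsets(h_offsets.begin(), h_offsets.end());

  // Subtract the chunk's first edge from every offset lazily; no extra buffer is materialized.
  auto rebased_first = thrust::make_transform_iterator(offsets.begin(), rebase_offset_t<int>{10});

  // Reading through the iterator yields the chunk-local segment boundaries {0, 2, 5, 9}.
  thrust::device_vector<int> rebased(rebased_first, rebased_first + offsets.size());
}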

// compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid
template <bool store_transposed, typename vertex_t, typename edge_t, typename weight_t>
std::tuple<rmm::device_uvector<edge_t>,
@@ -202,6 +211,145 @@ compress_edgelist(edgelist_t<vertex_t, edge_t, weight_t> const& edgelist,
std::move(offsets), std::move(indices), std::move(weights), std::move(dcs_nzd_vertices));
}

template <typename vertex_t, typename edge_t, typename weight_t>
void sort_adjacency_list(raft::handle_t const& handle,
edge_t const* offsets,
vertex_t* indices /* [INOUT] */,
std::optional<weight_t*> weights /* [INOUT] */,
vertex_t num_vertices,
edge_t num_edges)
{
// FIXME: The current cub segmented sort based implementation is slower than the global sort
// based approach, but we expect cub's segmented sort performance to improve significantly within
// a few months. We also need to re-evaluate the performance & memory overhead of presorting the
// edge list and running thrust::reduce to update offsets vs. the current approach after updating
// the python interface. If we take r-values of the rmm::device_uvector edge list, we can do
// indices_ = std::move(minors) & weights_ = std::move(weights). This affects peak memory use and
// we may find the presorting approach more attractive in that scenario.

// 1. We segment-sort the edges in chunks; chunk boundaries are adjusted to vertex boundaries so
// that each vertex's neighbors are sorted within a single chunk.

// to limit memory footprint ((1 << 20) is a tuning parameter)
auto approx_edges_to_sort_per_iteration =
static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 20);
auto search_offset_first =
thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
[approx_edges_to_sort_per_iteration] __device__(auto i) {
return i * approx_edges_to_sort_per_iteration;
});
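// search_offset_first enumerates the target cumulative edge counts (1, 2, ... times the
// per-iteration edge budget) at which chunk boundaries will be placed.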
auto num_chunks =
(num_edges + approx_edges_to_sort_per_iteration - 1) / approx_edges_to_sort_per_iteration;
rmm::device_uvector<vertex_t> d_vertex_offsets(num_chunks - 1, handle.get_stream());
thrust::lower_bound(handle.get_thrust_policy(),
offsets,
offsets + num_vertices + 1,
search_offset_first,
search_offset_first + d_vertex_offsets.size(),
d_vertex_offsets.begin());
rmm::device_uvector<edge_t> d_edge_offsets(d_vertex_offsets.size(), handle.get_stream());
thrust::gather(handle.get_thrust_policy(),
d_vertex_offsets.begin(),
d_vertex_offsets.end(),
offsets,
d_edge_offsets.begin());
std::vector<edge_t> h_edge_offsets(num_chunks + 1, edge_t{0});
h_edge_offsets.back() = num_edges;
raft::update_host(
h_edge_offsets.data() + 1, d_edge_offsets.data(), d_edge_offsets.size(), handle.get_stream());
std::vector<vertex_t> h_vertex_offsets(num_chunks + 1, vertex_t{0});
h_vertex_offsets.back() = num_vertices;
raft::update_host(h_vertex_offsets.data() + 1,
d_vertex_offsets.data(),
d_vertex_offsets.size(),
handle.get_stream());
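// chunk i now covers vertices [h_vertex_offsets[i], h_vertex_offsets[i + 1]) and edges
// [h_edge_offsets[i], h_edge_offsets[i + 1]); boundaries were snapped to vertex boundaries by
// lower_bound, so every vertex's neighbor list falls entirely within a single chunk.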

// 2. Segment-sort each vertex's neighbors

size_t max_chunk_size{0};
for (size_t i = 0; i < num_chunks; ++i) {
max_chunk_size =
std::max(max_chunk_size, static_cast<size_t>(h_edge_offsets[i + 1] - h_edge_offsets[i]));
}
rmm::device_uvector<vertex_t> segment_sorted_indices(max_chunk_size, handle.get_stream());
auto segment_sorted_weights =
weights ? std::make_optional<rmm::device_uvector<weight_t>>(max_chunk_size, handle.get_stream())
: std::nullopt;
rmm::device_uvector<std::byte> d_temp_storage(0, handle.get_stream());
for (size_t i = 0; i < num_chunks; ++i) {
size_t temp_storage_bytes{0};
auto offset_first = thrust::make_transform_iterator(offsets + h_vertex_offsets[i],
rebase_offset_t<edge_t>{h_edge_offsets[i]});
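// the first SortPairs/SortKeys call below passes a null d_temp_storage pointer; cub then only
// computes the required temporary storage size in temp_storage_bytes without sorting.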
if (weights) {
cub::DeviceSegmentedRadixSort::SortPairs(static_cast<void*>(nullptr),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
(*weights) + h_edge_offsets[i],
(*segment_sorted_weights).data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
Collaborator: sizeof(vertex_t) << 3 should be faster (although most likely optimized out by the compiler anyway).

Contributor Author: Yes... and this will be executed only a few times per graph creation, so even without the optimization this adds only a few nanoseconds; better to go for clarity.

handle.get_stream());
} else {
cub::DeviceSegmentedRadixSort::SortKeys(static_cast<void*>(nullptr),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
handle.get_stream());
}
if (temp_storage_bytes > d_temp_storage.size()) {
d_temp_storage = rmm::device_uvector<std::byte>(temp_storage_bytes, handle.get_stream());
}
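// the temporary storage buffer is reused across chunks and only grows to the largest
// requirement seen so far.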
if (weights) {
cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
(*weights) + h_edge_offsets[i],
(*segment_sorted_weights).data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
handle.get_stream());
} else {
cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage.data(),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
handle.get_stream());
}
thrust::copy(handle.get_thrust_policy(),
segment_sorted_indices.begin(),
segment_sorted_indices.begin() + (h_edge_offsets[i + 1] - h_edge_offsets[i]),
indices + h_edge_offsets[i]);
if (weights) {
thrust::copy(handle.get_thrust_policy(),
(*segment_sorted_weights).begin(),
(*segment_sorted_weights).begin() + (h_edge_offsets[i + 1] - h_edge_offsets[i]),
(*weights) + h_edge_offsets[i]);
}
}
}
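// Illustrative sketch (editorial note, not part of this diff): a minimal standalone example of
// the two-phase cub::DeviceSegmentedRadixSort pattern the loop above follows -- a first call with
// a null temporary-storage pointer to size the scratch buffer, then the actual sort. The key
// values, segment layout, and the name segmented_sort_sketch are made up for illustration.
#include <cub/cub.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>
#include <vector>

void segmented_sort_sketch(rmm::cuda_stream_view stream)
{
  // Two segments, [3 1 2] and [9 7], stored back to back in one key array.
  std::vector<int> h_keys{3, 1, 2, 9, 7};
  std::vector<int> h_offsets{0, 3, 5};

  rmm::device_uvector<int> d_keys_in(h_keys.size(), stream);
  rmm::device_uvector<int> d_keys_out(h_keys.size(), stream);
  rmm::device_uvector<int> d_offsets(h_offsets.size(), stream);
  cudaMemcpyAsync(d_keys_in.data(), h_keys.data(), sizeof(int) * h_keys.size(),
                  cudaMemcpyHostToDevice, stream.value());
  cudaMemcpyAsync(d_offsets.data(), h_offsets.data(), sizeof(int) * h_offsets.size(),
                  cudaMemcpyHostToDevice, stream.value());

  // Phase 1: query the required temporary storage size (d_temp_storage == nullptr).
  size_t temp_storage_bytes{0};
  cub::DeviceSegmentedRadixSort::SortKeys(static_cast<void*>(nullptr),
                                          temp_storage_bytes,
                                          d_keys_in.data(),
                                          d_keys_out.data(),
                                          static_cast<int>(h_keys.size()),
                                          static_cast<int>(h_offsets.size() - 1),
                                          d_offsets.data(),
                                          d_offsets.data() + 1,
                                          0,
                                          sizeof(int) * 8,
                                          stream.value());

  // Phase 2: allocate the scratch buffer and run the actual sort.
  rmm::device_uvector<std::byte> d_temp_storage(temp_storage_bytes, stream);
  cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage.data(),
                                          temp_storage_bytes,
                                          d_keys_in.data(),
                                          d_keys_out.data(),
                                          static_cast<int>(h_keys.size()),
                                          static_cast<int>(h_offsets.size() - 1),
                                          d_offsets.data(),
                                          d_offsets.data() + 1,
                                          0,
                                          sizeof(int) * 8,
                                          stream.value());
  // d_keys_out now holds {1, 2, 3, 7, 9}: each segment is sorted independently.
}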

} // namespace

template <typename vertex_t,
@@ -406,6 +554,19 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
}
}

// segmented sort neighbors

for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) {
sort_adjacency_list(handle,
adj_matrix_partition_offsets_[i].data(),
adj_matrix_partition_indices_[i].data(),
adj_matrix_partition_weights_
? std::optional<weight_t*>{(*adj_matrix_partition_weights_)[i].data()}
: std::nullopt,
static_cast<vertex_t>(adj_matrix_partition_offsets_[i].size() - 1),
static_cast<edge_t>(adj_matrix_partition_indices_[i].size()));
}

// if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to
// support storing edge row/column properties in (key, value) pairs.

@@ -645,6 +806,15 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
this->get_number_of_vertices(),
default_stream_view);

// segmented sort neighbors

sort_adjacency_list(handle,
offsets_.data(),
indices_.data(),
weights_ ? std::optional<weight_t*>{(*weights_).data()} : std::nullopt,
static_cast<vertex_t>(offsets_.size() - 1),
static_cast<edge_t>(indices_.size()));

// optional expensive checks (part 3/3)

if (do_expensive_check) {