Sort local neighbors in the graph adjacency list. (#1886)
This is necessary for implementing Node2Vec and triangle counting, and it also improves edge traversal performance, since sorted neighbor lists improve the locality of row/column property accesses.
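For context, the invariant established here is that, within each CSR/CSC adjacency matrix partition, the neighbor indices stored for a given vertex are kept in sorted order (with any edge weights permuted alongside). Below is a minimal host-side sketch of that invariant, illustrative only (not the GPU implementation in this diff); the function name is hypothetical:

#include <algorithm>
#include <cstddef>
#include <vector>

// Sort each vertex v's neighbor range [offsets[v], offsets[v + 1]) of a CSR adjacency list.
void sort_neighbors_host(std::vector<int> const& offsets, std::vector<int>& indices)
{
  for (std::size_t v = 0; v + 1 < offsets.size(); ++v) {
    std::sort(indices.begin() + offsets[v], indices.begin() + offsets[v + 1]);
  }
}

For example, offsets = {0, 2, 5} with indices = {3, 1, 4, 0, 2} becomes indices = {1, 3, 0, 2, 4}.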

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Andrei Schaffer (https://github.com/aschaffer)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: #1886
seunghwak authored Oct 18, 2021
1 parent 878489a commit 4cd2a55
170 changes: 170 additions & 0 deletions cpp/src/structure/graph_impl.cuh
@@ -33,6 +33,7 @@
#include <thrust/binary_search.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
@@ -76,6 +77,14 @@ struct has_nzd_t {
}
};

// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an
// extended __device__ lambda must allow its address to be taken)
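// rebase_offset_t converts global CSR offsets into chunk-local segment offsets for cub's
// segmented sort in sort_adjacency_list below.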
template <typename edge_t>
struct rebase_offset_t {
edge_t base_offset{};
__device__ edge_t operator()(edge_t offset) const { return offset - base_offset; }
};

// compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid
template <bool store_transposed, typename vertex_t, typename edge_t, typename weight_t>
std::tuple<rmm::device_uvector<edge_t>,
@@ -203,6 +212,145 @@ compress_edgelist(edgelist_t<vertex_t, edge_t, weight_t> const& edgelist,
std::move(offsets), std::move(indices), std::move(weights), std::move(dcs_nzd_vertices));
}

template <typename vertex_t, typename edge_t, typename weight_t>
void sort_adjacency_list(raft::handle_t const& handle,
edge_t const* offsets,
vertex_t* indices /* [INOUT] */,
std::optional<weight_t*> weights /* [INOUT] */,
vertex_t num_vertices,
edge_t num_edges)
{
// FIXME: The current cub segmented sort based implementation is slower than the global sort
// based approach, but we expect cub's segmented sort performance to improve significantly in the
// next few months. We also need to re-evaluate the performance & memory overhead of presorting the
// edge list and running thrust::reduce to update offsets vs. the current approach after updating
// the python interface. If we take r-values of the rmm::device_uvector edge list, we can do
// indices_ = std::move(minors) & weights_ = std::move(weights). This affects peak memory use and
// we may find the presorting approach more attractive under this scenario.

// 1. We segmented-sort edges in chunks; chunk boundaries need to be adjusted so that each
// vertex's neighbors fall in a single chunk and are sorted together.

// to limit memory footprint ((1 << 20) is a tuning parameter)
auto approx_edges_to_sort_per_iteration =
static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 20);
auto search_offset_first =
thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{1}),
[approx_edges_to_sort_per_iteration] __device__(auto i) {
return i * approx_edges_to_sort_per_iteration;
});
auto num_chunks =
(num_edges + approx_edges_to_sort_per_iteration - 1) / approx_edges_to_sort_per_iteration;
rmm::device_uvector<vertex_t> d_vertex_offsets(num_chunks - 1, handle.get_stream());
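// round each tentative chunk boundary to a vertex boundary (the first vertex whose offset is at
// or past the boundary) so that no vertex's neighbor list is split across chunks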
thrust::lower_bound(handle.get_thrust_policy(),
offsets,
offsets + num_vertices + 1,
search_offset_first,
search_offset_first + d_vertex_offsets.size(),
d_vertex_offsets.begin());
rmm::device_uvector<edge_t> d_edge_offsets(d_vertex_offsets.size(), handle.get_stream());
thrust::gather(handle.get_thrust_policy(),
d_vertex_offsets.begin(),
d_vertex_offsets.end(),
offsets,
d_edge_offsets.begin());
std::vector<edge_t> h_edge_offsets(num_chunks + 1, edge_t{0});
h_edge_offsets.back() = num_edges;
raft::update_host(
h_edge_offsets.data() + 1, d_edge_offsets.data(), d_edge_offsets.size(), handle.get_stream());
std::vector<vertex_t> h_vertex_offsets(num_chunks + 1, vertex_t{0});
h_vertex_offsets.back() = num_vertices;
raft::update_host(h_vertex_offsets.data() + 1,
d_vertex_offsets.data(),
d_vertex_offsets.size(),
handle.get_stream());

// 2. Segmented sort each vertex's neighbors

size_t max_chunk_size{0};
for (size_t i = 0; i < num_chunks; ++i) {
max_chunk_size =
std::max(max_chunk_size, static_cast<size_t>(h_edge_offsets[i + 1] - h_edge_offsets[i]));
}
rmm::device_uvector<vertex_t> segment_sorted_indices(max_chunk_size, handle.get_stream());
auto segment_sorted_weights =
weights ? std::make_optional<rmm::device_uvector<weight_t>>(max_chunk_size, handle.get_stream())
: std::nullopt;
rmm::device_uvector<std::byte> d_temp_storage(0, handle.get_stream());
for (size_t i = 0; i < num_chunks; ++i) {
size_t temp_storage_bytes{0};
auto offset_first = thrust::make_transform_iterator(offsets + h_vertex_offsets[i],
rebase_offset_t<edge_t>{h_edge_offsets[i]});
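// first call with a null d_temp_storage pointer only computes the required temporary storage
// size (cub's usual two-phase pattern)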
if (weights) {
cub::DeviceSegmentedRadixSort::SortPairs(static_cast<void*>(nullptr),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
(*weights) + h_edge_offsets[i],
(*segment_sorted_weights).data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
handle.get_stream());
} else {
cub::DeviceSegmentedRadixSort::SortKeys(static_cast<void*>(nullptr),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
handle.get_stream());
}
if (temp_storage_bytes > d_temp_storage.size()) {
d_temp_storage = rmm::device_uvector<std::byte>(temp_storage_bytes, handle.get_stream());
}
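// second call performs the actual segmented sort using the allocated temporary storage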
if (weights) {
cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
(*weights) + h_edge_offsets[i],
(*segment_sorted_weights).data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
handle.get_stream());
} else {
cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage.data(),
temp_storage_bytes,
indices + h_edge_offsets[i],
segment_sorted_indices.data(),
h_edge_offsets[i + 1] - h_edge_offsets[i],
h_vertex_offsets[i + 1] - h_vertex_offsets[i],
offset_first,
offset_first + 1,
0,
sizeof(vertex_t) * 8,
handle.get_stream());
}
thrust::copy(handle.get_thrust_policy(),
segment_sorted_indices.begin(),
segment_sorted_indices.begin() + (h_edge_offsets[i + 1] - h_edge_offsets[i]),
indices + h_edge_offsets[i]);
if (weights) {
thrust::copy(handle.get_thrust_policy(),
(*segment_sorted_weights).begin(),
(*segment_sorted_weights).begin() + (h_edge_offsets[i + 1] - h_edge_offsets[i]),
(*weights) + h_edge_offsets[i]);
}
}
}

} // namespace

template <typename vertex_t,
@@ -407,6 +555,19 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
}
}

// segmented sort neighbors

for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) {
sort_adjacency_list(handle,
adj_matrix_partition_offsets_[i].data(),
adj_matrix_partition_indices_[i].data(),
adj_matrix_partition_weights_
? std::optional<weight_t*>{(*adj_matrix_partition_weights_)[i].data()}
: std::nullopt,
static_cast<vertex_t>(adj_matrix_partition_offsets_[i].size() - 1),
static_cast<edge_t>(adj_matrix_partition_indices_[i].size()));
}

// if # unique edge rows/cols << V / row_comm_size|col_comm_size, store unique edge rows/cols to
// support storing edge row/column properties in (key, value) pairs.

@@ -646,6 +807,15 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
this->get_number_of_vertices(),
default_stream_view);

// segmented sort neighbors

sort_adjacency_list(handle,
offsets_.data(),
indices_.data(),
weights_ ? std::optional<weight_t*>{(*weights_).data()} : std::nullopt,
static_cast<vertex_t>(offsets_.size() - 1),
static_cast<edge_t>(indices_.size()));

// optional expensive checks (part 3/3)

if (do_expensive_check) {