Skip to content

Commit

Permalink
Support nan_equality in cudf::distinct (#11118)
Browse files Browse the repository at this point in the history
This adds a `nan_equality` parameter to `cudf::distinct`, allowing callers to specify the desired behavior when dealing with floating-point data: whether a `NaN` should compare equal to other `NaN` values or not.

Depends on #11052 (built on top of it).
Closes #11092.
This is a blocker for set-like operations (#11043) and also the last blocker for #11053.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #11118
  • Loading branch information
ttnghia authored Jun 23, 2022
1 parent 6362fbe commit 80d7cc7
Show file tree
Hide file tree
Showing 11 changed files with 272 additions and 36 deletions.
9 changes: 7 additions & 2 deletions cpp/benchmarks/stream_compaction/distinct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL,
stream_view);
});
}
Expand Down Expand Up @@ -90,8 +91,12 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
auto result = cudf::detail::distinct(
*table, {0}, cudf::duplicate_keep_option::KEEP_ANY, cudf::null_equality::EQUAL, stream_view);
auto result = cudf::detail::distinct(*table,
{0},
cudf::duplicate_keep_option::KEEP_ANY,
cudf::null_equality::EQUAL,
cudf::nan_equality::ALL_EQUAL,
stream_view);
});
}

Expand Down
11 changes: 7 additions & 4 deletions cpp/include/cudf/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ std::unique_ptr<table> distinct(
std::vector<size_type> const& keys,
duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand All @@ -95,17 +96,19 @@ std::unique_ptr<table> distinct(
* generated. If there are duplicate rows, which index is kept depends on the `keep` parameter.
*
* @param input The input table
* @param keep Get index of the first, last, any, or none row among the found duplicates rows
* @param keep Get index of any, first, last, or none of the found duplicates
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned vector
* @return A device_uvector containing the result indices
*/
rmm::device_uvector<size_type> get_distinct_indices(
table_view const& input,
duplicate_keep_option keep,
null_equality nulls_equal,
rmm::cuda_stream_view stream,
duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
5 changes: 3 additions & 2 deletions cpp/include/cudf/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,8 @@ std::unique_ptr<table> unique(
* @param[in] input input table_view to copy only distinct rows
* @param[in] keys vector of indices representing key columns from `input`
* @param[in] keep keep any, first, last, or none of the found duplicates
* @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
* equal if null_equality::UNEQUAL
* @param[in] nulls_equal flag to control if nulls are compared equal or not
* @param[in] nans_equal flag to control if floating-point NaN values are compared equal or not
* @param[in] mr Device memory resource used to allocate the returned table's device
* memory
*
Expand All @@ -275,6 +275,7 @@ std::unique_ptr<table> distinct(
std::vector<size_type> const& keys,
duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
1 change: 1 addition & 0 deletions cpp/src/dictionary/add_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ std::unique_ptr<column> add_keys(
std::vector<size_type>{0}, // only one key column
duplicate_keep_option::KEEP_ANY,
null_equality::EQUAL,
nan_equality::ALL_EQUAL,
stream,
mr);
std::vector<order> column_order{order::ASCENDING};
Expand Down
1 change: 1 addition & 0 deletions cpp/src/dictionary/detail/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
std::vector<size_type>{0},
duplicate_keep_option::KEEP_ANY,
null_equality::EQUAL,
nan_equality::ALL_EQUAL,
stream,
mr);
auto sorted_keys = cudf::detail::sort(table_keys->view(),
Expand Down
1 change: 1 addition & 0 deletions cpp/src/dictionary/set_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ std::unique_ptr<column> set_keys(
std::vector<size_type>{0},
duplicate_keep_option::KEEP_ANY,
null_equality::EQUAL,
nan_equality::ALL_EQUAL,
stream,
mr);
auto sorted_keys = cudf::detail::sort(distinct_keys->view(),
Expand Down
39 changes: 31 additions & 8 deletions cpp/src/stream_compaction/distinct.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ namespace detail {
rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
duplicate_keep_option keep,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand All @@ -59,12 +60,24 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));

auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal);
auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);

auto const pair_iter = cudf::detail::make_counting_transform_iterator(
size_type{0}, [] __device__(size_type const i) { return cuco::make_pair(i, i); });
map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());

auto const insert_keys = [&](auto const value_comp) {
auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp);
map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
};

if (nans_equal == nan_equality::ALL_EQUAL) {
using nan_equal_comparator =
cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
insert_keys(nan_equal_comparator{});
} else {
using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
insert_keys(nan_unequal_comparator{});
}

auto output_indices = rmm::device_uvector<size_type>(map.get_size(), stream, mr);

Expand All @@ -74,9 +87,15 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
return output_indices;
}

// For other keep options, perform a (sparse) reduce-by-row on the rows compared equal.
auto const reduction_results = hash_reduce_by_row(
map, std::move(preprocessed_input), input.num_rows(), has_nulls, keep, nulls_equal, stream);
// For other keep options, reduce by row on rows that compare equal.
auto const reduction_results = hash_reduce_by_row(map,
std::move(preprocessed_input),
input.num_rows(),
has_nulls,
keep,
nulls_equal,
nans_equal,
stream);

// Extract the desired output indices from reduction results.
auto const map_end = [&] {
Expand Down Expand Up @@ -111,14 +130,16 @@ std::unique_ptr<table> distinct(table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
return empty_like(input);
}

auto const gather_map = get_distinct_indices(input.select(keys), keep, nulls_equal, stream);
auto const gather_map =
get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream);
return detail::gather(input,
gather_map,
out_of_bounds_policy::DONT_CHECK,
Expand All @@ -133,10 +154,12 @@ std::unique_ptr<table> distinct(table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::distinct(input, keys, keep, nulls_equal, cudf::default_stream_value, mr);
return detail::distinct(
input, keys, keep, nulls_equal, nans_equal, cudf::default_stream_value, mr);
}

} // namespace cudf
30 changes: 21 additions & 9 deletions cpp/src/stream_compaction/distinct_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
cudf::nullate::DYNAMIC has_nulls,
duplicate_keep_option keep,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand All @@ -111,15 +112,26 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));

auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal);

thrust::for_each(
rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(num_rows),
reduce_by_row_fn{
map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);

auto const reduce_by_row = [&](auto const value_comp) {
auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp);
thrust::for_each(
rmm::exec_policy(stream),
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(num_rows),
reduce_by_row_fn{
map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
};

if (nans_equal == nan_equality::ALL_EQUAL) {
using nan_equal_comparator =
cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
reduce_by_row(nan_equal_comparator{});
} else {
using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
reduce_by_row(nan_unequal_comparator{});
}

return reduction_results;
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/stream_compaction/distinct_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
cudf::nullate::DYNAMIC has_nulls,
duplicate_keep_option keep,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
9 changes: 7 additions & 2 deletions cpp/src/transform/encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,13 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<column>> encode(
std::vector<size_type> drop_keys(num_cols);
std::iota(drop_keys.begin(), drop_keys.end(), 0);

auto distinct_keys = cudf::detail::distinct(
input_table, drop_keys, duplicate_keep_option::KEEP_ANY, null_equality::EQUAL, stream, mr);
auto distinct_keys = cudf::detail::distinct(input_table,
drop_keys,
duplicate_keep_option::KEEP_ANY,
null_equality::EQUAL,
nan_equality::ALL_EQUAL,
stream,
mr);

std::vector<order> column_order(num_cols, order::ASCENDING);
std::vector<null_order> null_precedence(num_cols, null_order::AFTER);
Expand Down
Loading

0 comments on commit 80d7cc7

Please sign in to comment.