Skip to content

Commit

Permalink
Fix inconsistency when hashing two tables in cudf::detail::contains (
Browse files Browse the repository at this point in the history
…#11284)

When hashing elements in a column, the nullable information needs to be taken into account. This could produce different results for the hash values of the same elements if the nullable condition is changed. As such, whenever we need to compute hashing for more than one column/table in the same operation, we need to take into account the nullable of all columns/tables, not just the one that is being hashed.

This PR fixes a bug when hashing two tables using different nullable values in `cudf::detail::contains`, which led to incorrect results when there is only one nullable table in the input.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #11284
  • Loading branch information
ttnghia authored Jul 18, 2022
1 parent 9627091 commit 5f9da83
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 6 deletions.
9 changes: 5 additions & 4 deletions cpp/src/search/contains_table.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,12 @@ rmm::device_uvector<bool> contains(table_view const& haystack,

auto const haystack_has_nulls = has_nested_nulls(haystack);
auto const needles_has_nulls = has_nested_nulls(needles);
auto const has_any_nulls = haystack_has_nulls || needles_has_nulls;

// Insert all row hash values and indices of the haystack table.
{
auto const hasher = cudf::experimental::row::hash::row_hasher(haystack, stream);
auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{haystack_has_nulls});
auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{has_any_nulls});

using make_pair_fn = make_pair_function<decltype(d_hasher), lhs_index_type>;

Expand Down Expand Up @@ -110,7 +111,7 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
// Check existence for each row of the needles table in the haystack table.
{
auto const hasher = cudf::experimental::row::hash::row_hasher(needles, stream);
auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{needles_has_nulls});
auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{has_any_nulls});

auto const comparator =
cudf::experimental::row::equality::two_table_comparator(haystack, needles, stream);
Expand All @@ -121,8 +122,8 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
size_type{0}, make_pair_fn{d_hasher, map.get_empty_key_sentinel()});

auto const check_contains = [&](auto const value_comp) {
auto const d_eqcomp = comparator.equal_to(
nullate::DYNAMIC{needles_has_nulls || haystack_has_nulls}, compare_nulls, value_comp);
auto const d_eqcomp =
comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp);
map.pair_contains(needles_it,
needles_it + needles.num_rows(),
contained.begin(),
Expand Down
31 changes: 29 additions & 2 deletions cpp/tests/join/semi_anti_join_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -26,6 +26,7 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/table_utilities.hpp>

#include <thrust/iterator/transform_iterator.h>
Expand All @@ -52,7 +53,7 @@ TEST_F(JoinTest, TestSimple)
cudf::data_type{cudf::type_to_id<cudf::size_type>()}, result->size(), result->data());
column_wrapper<cudf::size_type> expected{0, 1};
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result_cv);
};
}

std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> get_saj_tables(
std::vector<bool> const& left_is_human_nulls, std::vector<bool> const& right_is_human_nulls)
Expand Down Expand Up @@ -230,3 +231,29 @@ TEST_F(JoinTest, AntiJoinWithStructsAndNullsNotEqual)

CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
}

TEST_F(JoinTest, AntiJoinWithStructsAndNullsOnOneSide)
{
auto constexpr null{0};
auto left_col0 = [] {
column_wrapper<int32_t> child1{{1, null}, cudf::test::iterators::null_at(1)};
column_wrapper<int32_t> child2{11, 12};
return cudf::test::structs_column_wrapper{{child1, child2}};
}();
auto right_col0 = [] {
column_wrapper<int32_t> child1{1, 2, 3, 4};
column_wrapper<int32_t> child2{11, 12, 13, 14};
return cudf::test::structs_column_wrapper{{child1, child2}};
}();

auto left = cudf::table_view{{left_col0}};
auto right = cudf::table_view{{right_col0}};

auto result = cudf::left_anti_join(left, right, {0}, {0});
auto expected = [] {
column_wrapper<int32_t> child1{{null}, cudf::test::iterators::null_at(0)};
column_wrapper<int32_t> child2{12};
return cudf::test::structs_column_wrapper{{child1, child2}};
}();
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0).view());
}

0 comments on commit 5f9da83

Please sign in to comment.