From 8e427c409bd6964deeeba65c5f0b62a1b09c3ffa Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 17 May 2022 10:27:45 -0500 Subject: [PATCH 1/5] Don't allow unsanitary lists when doing full equality comparison in tests. --- cpp/tests/quantiles/percentile_approx_test.cu | 3 +- cpp/tests/utilities/column_utilities.cu | 86 +++++++++++++------ 2 files changed, 62 insertions(+), 27 deletions(-) diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cu index 9af42e1589d..ba30538a3d5 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cu +++ b/cpp/tests/quantiles/percentile_approx_test.cu @@ -404,7 +404,8 @@ TEST_F(PercentileApproxTest, EmptyInput) 3, cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + // TODO: change percentile_approx to produce sanitary list outputs for this case. + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } TEST_F(PercentileApproxTest, EmptyPercentiles) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 015178f8c7c..ca9a636c743 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -62,10 +62,50 @@ namespace test { namespace { -// expand all non-null rows in a list column into a column of child row indices. +std::unique_ptr generate_all_row_indices(size_type num_rows) +{ + auto indices = + cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED); + thrust::sequence(rmm::exec_policy(), + indices->mutable_view().begin(), + indices->mutable_view().end(), + 0); + return indices; +} + +// generate the rows indices that should be checked for the child column of a list column. +// +// - if we are just checking for equivalence, we can skip any rows that are nulls. this allows +// things like non-empty rows that have been nullified after creation. they may actually contain +// values, but since the row is null they don't matter for equivalency. +// +// - if we are checking for exact equality, we need to check all rows. +// +// This allows us to differentiate between: +// +// List: +// Length : 1 +// Offsets : 0, 4 +// Null count: 1 +// 0 +// 0, 1, 2, 3 +// +// List: +// Length : 1 +// Offsets : 0, 0 +// Null count: 1 +// 0 +// std::unique_ptr generate_child_row_indices(lists_column_view const& c, - column_view const& row_indices) + column_view const& row_indices, + bool check_exact_equality) { + // if we are checking for exact equality, we should be checking for "unsanitized" data that may + // be hiding underneath nulls. so check all rows instead of just non-null rows + if (check_exact_equality) { + return generate_all_row_indices(c.get_sliced_child(rmm::cuda_stream_default).size()); + } + // Example input // List: // Length : 7 @@ -280,13 +320,16 @@ struct column_property_comparator { cudf::lists_column_view rhs_l(rhs); // recurse - auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); - // note: if a column is all nulls or otherwise empty, no indices are generated and no recursion - // happens - auto lhs_child_indices = generate_child_row_indices(lhs_l, lhs_row_indices); + + // note: if a column is all nulls (and we are checking for exact equality) or otherwise empty, + // no indices are generated and no recursion happens + auto lhs_child_indices = + generate_child_row_indices(lhs_l, lhs_row_indices, check_exact_equality); if (lhs_child_indices->size() > 0) { - auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); - auto rhs_child_indices = generate_child_row_indices(rhs_l, rhs_row_indices); + auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child_indices = + generate_child_row_indices(rhs_l, rhs_row_indices, check_exact_equality); return cudf::type_dispatcher(lhs_child.type(), column_property_comparator{}, lhs_child, @@ -647,14 +690,16 @@ struct column_comparator_impl { return false; } - // recurse. - auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); - // note: if a column is all nulls or otherwise empty, no indices are generated and no recursion - // happens - auto lhs_child_indices = generate_child_row_indices(lhs_l, lhs_row_indices); + // recurse + // note: if a column is all nulls (and we are only checking for equivalence) or otherwise empty, + // no indices are generated and no recursion happens + auto lhs_child_indices = + generate_child_row_indices(lhs_l, lhs_row_indices, check_exact_equality); if (lhs_child_indices->size() > 0) { - auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); - auto rhs_child_indices = generate_child_row_indices(rhs_l, rhs_row_indices); + auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child_indices = + generate_child_row_indices(rhs_l, rhs_row_indices, check_exact_equality); return cudf::type_dispatcher(lhs_child.type(), column_comparator{}, lhs_child, @@ -733,17 +778,6 @@ struct column_comparator { } }; -std::unique_ptr generate_all_row_indices(size_type num_rows) -{ - auto indices = - cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED); - thrust::sequence(rmm::exec_policy(), - indices->mutable_view().begin(), - indices->mutable_view().end(), - 0); - return indices; -} - } // namespace /** From 6f20695fe8ab41beab128edaed419d81a1657e40 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 17 May 2022 15:23:52 -0500 Subject: [PATCH 2/5] Add tests and a new function to lists_column_wrapper: make_one_empty_row_column(). --- cpp/include/cudf_test/column_wrapper.hpp | 37 +++++++++++++++++++ .../column_utilities_tests.cpp | 33 +++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index ff2ff2a0961..cf389eb3cc0 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1441,7 +1441,44 @@ class lists_column_wrapper : public detail::column_wrapper { build_from_nested(elements, validity); } + /** + * @brief Construct a list column containing a single empty, optionally null row. + * + * @param valid Whether or not the empty row is also null + */ + static lists_column_wrapper make_one_empty_row_column(bool valid = true) + { + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + cudf::test::fixed_width_column_wrapper values{}; + return lists_column_wrapper( + 1, + offsets.release(), + values.release(), + !valid ? 1 : 0, + !valid ? cudf::create_null_mask(1, cudf::mask_state::ALL_NULL) : rmm::device_buffer{}); + } + private: + /** + * @brief Construct a list column from constituent parts. + * + * @param num_rows The number of lists the column represents + * @param offsets The column of offset values for this column + * @param values The column of values bounded by the offsets + * @param null_count The number of null list entries + * @param null_mask The bits specifying the null lists in device memory + */ + lists_column_wrapper(size_type num_rows, + std::unique_ptr&& offsets, + std::unique_ptr&& values, + size_type null_count, + rmm::device_buffer&& null_mask) + { + // construct the list column + wrapped = make_lists_column( + num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + } + /** * @brief Initialize as a nested list column composed of other list columns. * diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index fb4125d1752..63100d961c4 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -364,6 +364,39 @@ TEST_F(ColumnUtilitiesListsTest, Equivalence) } } +TEST_F(ColumnUtilitiesListsTest, UnsanitaryLists) +{ + // unsanitary + // + // List: + // Length : 1 + // Offsets : 0, 3 + // Null count: 1 + // 0 + // 0, 1, 2 + cudf::test::fixed_width_column_wrapper offsets{0, 3}; + cudf::test::fixed_width_column_wrapper values{0, 1, 2}; + auto l0 = cudf::make_lists_column(1, + offsets.release(), + values.release(), + 1, + cudf::create_null_mask(1, cudf::mask_state::ALL_NULL)); + + // sanitary + // + // List: + // Length : 1 + // Offsets : 0, 0 + // Null count: 1 + // 0 + auto l1 = cudf::test::lists_column_wrapper::make_one_empty_row_column(false); + + // equivalent, but not equal + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*l0, l1); + EXPECT_EQ(cudf::test::expect_columns_equal(*l0, l1, cudf::test::debug_output_level::QUIET), + false); +} + TEST_F(ColumnUtilitiesListsTest, DifferentPhysicalStructure) { // list From 59eb7541087665d32782450f668f8de6da68eac7 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 18 May 2022 09:37:05 -0500 Subject: [PATCH 3/5] PR review changes. --- cpp/include/cudf_test/column_wrapper.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cf389eb3cc0..80cf9cf6611 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1454,7 +1454,7 @@ class lists_column_wrapper : public detail::column_wrapper { 1, offsets.release(), values.release(), - !valid ? 1 : 0, + static_cast(!valid), !valid ? cudf::create_null_mask(1, cudf::mask_state::ALL_NULL) : rmm::device_buffer{}); } From 868d6207089c2ed3ae1cb7ca5cf2991c8e02b244 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Thu, 19 May 2022 10:56:08 -0500 Subject: [PATCH 4/5] PR review tweaks. --- cpp/include/cudf_test/column_wrapper.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 80cf9cf6611..8068373ca65 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1454,8 +1454,8 @@ class lists_column_wrapper : public detail::column_wrapper { 1, offsets.release(), values.release(), - static_cast(!valid), - !valid ? cudf::create_null_mask(1, cudf::mask_state::ALL_NULL) : rmm::device_buffer{}); + valid ? 0 : 1, + valid ? rmm::device_buffer{} : cudf::create_null_mask(1, cudf::mask_state::ALL_NULL)); } private: From 4bbc42194cee2b914d3bf2557572871740d49abb Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Thu, 19 May 2022 11:03:57 -0500 Subject: [PATCH 5/5] Change EXPECT_EQ(...false) to EXPECT_FALSE. --- cpp/tests/utilities_tests/column_utilities_tests.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 63100d961c4..ec87fde0f51 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -393,8 +393,7 @@ TEST_F(ColumnUtilitiesListsTest, UnsanitaryLists) // equivalent, but not equal CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*l0, l1); - EXPECT_EQ(cudf::test::expect_columns_equal(*l0, l1, cudf::test::debug_output_level::QUIET), - false); + EXPECT_FALSE(cudf::test::expect_columns_equal(*l0, l1, cudf::test::debug_output_level::QUIET)); } TEST_F(ColumnUtilitiesListsTest, DifferentPhysicalStructure)