From c0895c1720d864df85c81d7c094bb9cb475d1e57 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Thu, 19 May 2022 13:39:43 -0500 Subject: [PATCH] Make cudf::test::expect_columns_equal() to fail when comparing unsanitary lists. (#10880) Fixes: https://github.com/rapidsai/cudf/issues/10855 Unsanitary lists (those with null rows that actually have backing underlying data) should not compare as equal to sanitized lists, but _should_ compare as equivalent. Only one test in cudf was affected by this change, which I've opened a second issue for: https://github.com/rapidsai/cudf/issues/10856 This PR also adds a utility function to `cudf::test::lists_column_wrapper`, `static lists_column_wrapper make_one_empty_row_column(bool valid = true)` Due to the way initializer lists and nested class initialization work, it was impossible to create a list column with 1 row that was empty, so I added this explicit function to handle the edge case. Authors: - https://github.com/nvdbaranec Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10880 --- cpp/include/cudf_test/column_wrapper.hpp | 37 ++++++++ cpp/tests/quantiles/percentile_approx_test.cu | 3 +- cpp/tests/utilities/column_utilities.cu | 86 +++++++++++++------ .../column_utilities_tests.cpp | 32 +++++++ 4 files changed, 131 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index ff2ff2a0961..8068373ca65 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1441,7 +1441,44 @@ class lists_column_wrapper : public detail::column_wrapper { build_from_nested(elements, validity); } + /** + * @brief Construct a list column containing a single empty, optionally null row. + * + * @param valid Whether or not the empty row is also null + */ + static lists_column_wrapper make_one_empty_row_column(bool valid = true) + { + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + cudf::test::fixed_width_column_wrapper values{}; + return lists_column_wrapper( + 1, + offsets.release(), + values.release(), + valid ? 0 : 1, + valid ? rmm::device_buffer{} : cudf::create_null_mask(1, cudf::mask_state::ALL_NULL)); + } + private: + /** + * @brief Construct a list column from constituent parts. + * + * @param num_rows The number of lists the column represents + * @param offsets The column of offset values for this column + * @param values The column of values bounded by the offsets + * @param null_count The number of null list entries + * @param null_mask The bits specifying the null lists in device memory + */ + lists_column_wrapper(size_type num_rows, + std::unique_ptr&& offsets, + std::unique_ptr&& values, + size_type null_count, + rmm::device_buffer&& null_mask) + { + // construct the list column + wrapped = make_lists_column( + num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + } + /** * @brief Initialize as a nested list column composed of other list columns. * diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cu index 9af42e1589d..ba30538a3d5 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cu +++ b/cpp/tests/quantiles/percentile_approx_test.cu @@ -404,7 +404,8 @@ TEST_F(PercentileApproxTest, EmptyInput) 3, cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + // TODO: change percentile_approx to produce sanitary list outputs for this case. + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } TEST_F(PercentileApproxTest, EmptyPercentiles) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 015178f8c7c..ca9a636c743 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -62,10 +62,50 @@ namespace test { namespace { -// expand all non-null rows in a list column into a column of child row indices. +std::unique_ptr generate_all_row_indices(size_type num_rows) +{ + auto indices = + cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED); + thrust::sequence(rmm::exec_policy(), + indices->mutable_view().begin(), + indices->mutable_view().end(), + 0); + return indices; +} + +// generate the rows indices that should be checked for the child column of a list column. +// +// - if we are just checking for equivalence, we can skip any rows that are nulls. this allows +// things like non-empty rows that have been nullified after creation. they may actually contain +// values, but since the row is null they don't matter for equivalency. +// +// - if we are checking for exact equality, we need to check all rows. +// +// This allows us to differentiate between: +// +// List: +// Length : 1 +// Offsets : 0, 4 +// Null count: 1 +// 0 +// 0, 1, 2, 3 +// +// List: +// Length : 1 +// Offsets : 0, 0 +// Null count: 1 +// 0 +// std::unique_ptr generate_child_row_indices(lists_column_view const& c, - column_view const& row_indices) + column_view const& row_indices, + bool check_exact_equality) { + // if we are checking for exact equality, we should be checking for "unsanitized" data that may + // be hiding underneath nulls. so check all rows instead of just non-null rows + if (check_exact_equality) { + return generate_all_row_indices(c.get_sliced_child(rmm::cuda_stream_default).size()); + } + // Example input // List: // Length : 7 @@ -280,13 +320,16 @@ struct column_property_comparator { cudf::lists_column_view rhs_l(rhs); // recurse - auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); - // note: if a column is all nulls or otherwise empty, no indices are generated and no recursion - // happens - auto lhs_child_indices = generate_child_row_indices(lhs_l, lhs_row_indices); + + // note: if a column is all nulls (and we are checking for exact equality) or otherwise empty, + // no indices are generated and no recursion happens + auto lhs_child_indices = + generate_child_row_indices(lhs_l, lhs_row_indices, check_exact_equality); if (lhs_child_indices->size() > 0) { - auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); - auto rhs_child_indices = generate_child_row_indices(rhs_l, rhs_row_indices); + auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child_indices = + generate_child_row_indices(rhs_l, rhs_row_indices, check_exact_equality); return cudf::type_dispatcher(lhs_child.type(), column_property_comparator{}, lhs_child, @@ -647,14 +690,16 @@ struct column_comparator_impl { return false; } - // recurse. - auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); - // note: if a column is all nulls or otherwise empty, no indices are generated and no recursion - // happens - auto lhs_child_indices = generate_child_row_indices(lhs_l, lhs_row_indices); + // recurse + // note: if a column is all nulls (and we are only checking for equivalence) or otherwise empty, + // no indices are generated and no recursion happens + auto lhs_child_indices = + generate_child_row_indices(lhs_l, lhs_row_indices, check_exact_equality); if (lhs_child_indices->size() > 0) { - auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); - auto rhs_child_indices = generate_child_row_indices(rhs_l, rhs_row_indices); + auto lhs_child = lhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child = rhs_l.get_sliced_child(rmm::cuda_stream_default); + auto rhs_child_indices = + generate_child_row_indices(rhs_l, rhs_row_indices, check_exact_equality); return cudf::type_dispatcher(lhs_child.type(), column_comparator{}, lhs_child, @@ -733,17 +778,6 @@ struct column_comparator { } }; -std::unique_ptr generate_all_row_indices(size_type num_rows) -{ - auto indices = - cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED); - thrust::sequence(rmm::exec_policy(), - indices->mutable_view().begin(), - indices->mutable_view().end(), - 0); - return indices; -} - } // namespace /** diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index fb4125d1752..ec87fde0f51 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -364,6 +364,38 @@ TEST_F(ColumnUtilitiesListsTest, Equivalence) } } +TEST_F(ColumnUtilitiesListsTest, UnsanitaryLists) +{ + // unsanitary + // + // List: + // Length : 1 + // Offsets : 0, 3 + // Null count: 1 + // 0 + // 0, 1, 2 + cudf::test::fixed_width_column_wrapper offsets{0, 3}; + cudf::test::fixed_width_column_wrapper values{0, 1, 2}; + auto l0 = cudf::make_lists_column(1, + offsets.release(), + values.release(), + 1, + cudf::create_null_mask(1, cudf::mask_state::ALL_NULL)); + + // sanitary + // + // List: + // Length : 1 + // Offsets : 0, 0 + // Null count: 1 + // 0 + auto l1 = cudf::test::lists_column_wrapper::make_one_empty_row_column(false); + + // equivalent, but not equal + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*l0, l1); + EXPECT_FALSE(cudf::test::expect_columns_equal(*l0, l1, cudf::test::debug_output_level::QUIET)); +} + TEST_F(ColumnUtilitiesListsTest, DifferentPhysicalStructure) { // list