diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index f1ce3b7f0e3..e778428510d 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -28,32 +28,32 @@ namespace lists { */ /** - * @brief Create a new lists column by removing duplicated entries from each list element in the - * given lists column + * @brief Create a new lists column by extracting unique entries from list elements in the given + * lists column. * - * @throw cudf::logic_error if any row (list element) in the input column is a nested type. - * - * Given an `input` lists_column_view, the list elements in the column are copied to an output lists + * Given an input lists column, the list elements in the column are copied to an output lists * column such that their duplicated entries are dropped out to keep only the unique ones. The * order of those entries within each list are not guaranteed to be preserved as in the input. In * the current implementation, entries in the output lists are sorted by ascending order (nulls * last), but this is not guaranteed in future implementation. * - * @param lists_column The input lists_column_view - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only - * applicable for floating point data column) - * @param mr Device resource used to allocate memory + * @throw cudf::logic_error if the child column of the input lists column contains nested type other + * than struct. + * + * @param lists_column The input lists column to extract lists with unique entries. + * @param nulls_equal Flag to specify whether null entries should be considered equal. + * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating point data column). 
+ * @param mr Device resource used to allocate memory. * * @code{.pseudo} - * lists_column = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } + * input = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } * output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } * - * Note that permuting the entries of each list in this output also produces another valid - * output. + * Note that permuting the entries of each list in this output also produces another valid output. * @endcode * - * @return A list column with list elements having unique entries + * @return A lists column with list elements having unique entries. */ std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 564d919b65d..e53ae4ff0c1 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -22,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -36,10 +40,15 @@ namespace lists { namespace detail { namespace { template -struct has_negative_nans { +struct has_negative_nans_fn { column_device_view const d_entries; bool const has_nulls; + has_negative_nans_fn(column_device_view const d_entries, bool const has_nulls) + : d_entries(d_entries), has_nulls(has_nulls) + { + } + __device__ Type operator()(size_type idx) const noexcept { if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } @@ -50,30 +59,53 @@ struct has_negative_nans { }; /** - * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has any negative NaN entry + * @brief A structure to be used along with type_dispatcher to check if a column has any + * negative NaN value. + * + * This functor is used to check for replacing negative NaN if there exists one. 
It is necessary + * because when calling `lists::detail::sort_lists`, the negative NaN and positive NaN values (if + * both exist) are separated to the two ends of the output column. This is because the API + * `lists::detail::sort_lists` internally calls `cub::DeviceSegmentedRadixSort`, which performs + * sorting by comparing bits of the input numbers. Since negative and positive NaN have + * different bit representations, they may not be moved to be close to each other after sorting. */ -struct has_negative_nans_fn { +struct has_negative_nans_dispatch { template >* = nullptr> bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept { auto const d_entries = column_device_view::create(lists_entries, stream); - return thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lists_entries.size()), - detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans_fn{*d_entries, lists_entries.has_nulls()}); } - template >* = nullptr> - bool operator()(column_view const&, rmm::cuda_stream_view) const noexcept + template >* = nullptr> + bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const { - // Columns of non floating-point data will never contain NaN + // Recursively check negative NaN on the children columns.
+ return std::any_of( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.num_children()), + [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher(col.type(), detail::has_negative_nans_dispatch{}, col, stream); + }); + } + + template && + !std::is_same_v>* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) const + { + // Columns of non floating-point data will never contain NaN. return false; } }; template -struct replace_negative_nans { +struct replace_negative_nans_fn { __device__ Type operator()(Type val) const noexcept { return std::isnan(val) ? std::numeric_limits::quiet_NaN() : val; @@ -81,58 +113,63 @@ struct replace_negative_nans { }; /** - * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries - * of a floating-point data column + * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all rows + * in a floating-point data column. */ -struct replace_negative_nans_fn { - template >* = nullptr> - void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const +struct replace_negative_nans_dispatch { + template && + !std::is_same_v>* = nullptr> + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view) const noexcept { - CUDF_FAIL("Cannot operate on a type that is not floating-point."); + // For non floating point type and non struct, just return a copy of the input. 
+ return std::make_unique(lists_entries); } template >* = nullptr> - void operator()(column_view const& lists_entries, - mutable_column_view const& new_entries, - rmm::cuda_stream_view stream) const noexcept + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view stream) const noexcept { - // Do not care whether an entry is null or not, just consider it as a floating-point value - thrust::transform(rmm::exec_policy(stream), - lists_entries.begin(), - lists_entries.end(), - new_entries.begin(), - detail::replace_negative_nans{}); - } -}; + auto new_entries = cudf::detail::allocate_like( + lists_entries, lists_entries.size(), cudf::mask_allocation_policy::NEVER, stream); + new_entries->set_null_mask(cudf::detail::copy_bitmask(lists_entries, stream), + lists_entries.null_count()); -/** - * @brief Transform a given lists column to a new lists column in which all the list entries holding - * -NaN value are replaced by (positive) NaN - */ -std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, - lists_column_view const& lists_column, - rmm::cuda_stream_view stream) -{ - auto new_offsets = std::make_unique(lists_column.offsets()); - auto new_entries = std::make_unique(lists_entries); + // Replace all negative NaN values. 
+ thrust::transform(rmm::exec_policy(stream), + lists_entries.template begin(), + lists_entries.template end(), + new_entries->mutable_view().template begin(), + detail::replace_negative_nans_fn{}); - type_dispatcher(lists_entries.type(), - detail::replace_negative_nans_fn{}, - lists_entries, - new_entries->mutable_view(), - stream); + return new_entries; + } - return make_lists_column( - lists_column.size(), - std::move(new_offsets), - std::move(new_entries), - lists_column.null_count(), - cudf::detail::copy_bitmask( - lists_column.parent(), stream, rmm::mr::get_current_device_resource())); -} + template >* = nullptr> + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view stream) const noexcept + { + std::vector> output_struct_members; + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.num_children()), + std::back_inserter(output_struct_members), + [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher(col.type(), detail::replace_negative_nans_dispatch{}, col, stream); + }); + + return cudf::make_structs_column(lists_entries.size(), + std::move(output_struct_members), + lists_entries.null_count(), + cudf::detail::copy_bitmask(lists_entries, stream), + stream); + } +}; /** - * @brief Generate a 0-based offset column for a lists column + * @brief Generate a 0-based offset column for a lists column. * * Given a lists_column_view, which may have a non-zero offset, generate a new column containing * 0-based list offsets. 
This is done by subtracting each of the input list offset by the first @@ -143,11 +180,10 @@ std::unique_ptr replace_negative_nans_entries(column_view const& lists_e * then output_offsets = { 0, 4, 6, 10 } * @endcode * - * @param lists_column The input lists column - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing 0-based list offsets + * @param lists_column The input lists column. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. + * @return A column containing 0-based list offsets. */ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, rmm::cuda_stream_view stream, @@ -168,7 +204,35 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co } /** - * @brief Populate list offsets for all list entries + * @brief Transform a given lists column to a new lists column in which all the list entries holding + * -NaN value are replaced by (positive) NaN. + * + * Replacing -NaN by NaN is necessary before sorting (individual) lists because the sorting API is + * using radix sort, which compares bits of the number thus it may separate -NaN by NaN to the two + * ends of the result column. + */ +std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, + lists_column_view const& lists_column, + rmm::cuda_stream_view stream) +{ + // We need to copy the offsets column of the input lists_column. Since the input lists_column may + // be sliced, we need to generate clean offsets (i.e., offsets starting from zero). 
+ auto new_offsets = + generate_clean_offsets(lists_column, stream, rmm::mr::get_current_device_resource()); + auto new_entries = type_dispatcher( + lists_entries.type(), detail::replace_negative_nans_dispatch{}, lists_entries, stream); + + return make_lists_column( + lists_column.size(), + std::move(new_offsets), + std::move(new_entries), + lists_column.null_count(), + cudf::detail::copy_bitmask( + lists_column.parent(), stream, rmm::mr::get_current_device_resource())); +} + +/** + * @brief Populate list offsets for all list entries. * * Given an `offsets` column_view containing offsets of a lists column and a number of all list * entries in the column, generate an array that maps from each list entry to the offset of the list @@ -179,12 +243,11 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } * @endcode * - * @param num_entries The number of list entries - * @param offsets Column view to the list offsets - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing entry list offsets + * @param num_entries The number of list entries. + * @param offsets Column view to the list offsets. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. + * @return A column containing entry list offsets. */ std::unique_ptr generate_entry_list_offsets(size_type num_entries, column_view const& offsets, @@ -205,95 +268,172 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, } /** - * @brief Performs an equality comparison between two entries in a lists column + * @brief Performs an equality comparison between two entries in a lists column. * - * For the two elements that are in the same list in the lists column, they will always be - * considered as different. 
If they are from the same list and their type is one of floating - * point types, this functor will return the same comparison result as - * `cudf::element_equality_comparator`. + * For the two elements that are NOT in the same list in the lists column, they will always be + * considered as different. If they are from the same list and their type is not floating point, + * this functor will return the same comparison result as `cudf::element_equality_comparator`. * * For floating-point types, entries holding NaN value can be considered as different values or the - * same value depending on the nans_equal parameter. + * same value depending on the `nans_equal` parameter. * - * @tparam Type The data type of entries + * @tparam Type The data type of entries * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only * applicable for floating-point data column) */ -template -class list_entry_comparator { - public: - list_entry_comparator(offset_type const* list_offsets, - column_device_view d_view, - null_equality nulls_equal, - bool has_nulls) - : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls) +template +struct column_row_comparator_fn { + offset_type const* const list_offsets; + column_device_view const lhs; + column_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + __host__ __device__ column_row_comparator_fn(offset_type const* const list_offsets, + column_device_view const& lhs, + column_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) { } - template - std::enable_if_t and nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + template >* = nullptr> + bool __device__ compare(T const& lhs_val, T const& 
rhs_val) const noexcept { - // Two entries are not considered for equality if they belong to different lists - if (list_offsets[i] != list_offsets[j]) { return false; } + return lhs_val == rhs_val; + } - if (has_nulls) { - bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; - bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; - if (lhs_is_null and rhs_is_null) { - return nulls_equal == null_equality::EQUAL; - } else if (lhs_is_null != rhs_is_null) { - return false; - } - } + template >* = nullptr> + bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept + { + // If both element(i) and element(j) are NaNs and nans are considered as equal value then this + // comparison will return `true`. This is the desired behavior in Pandas. + if (nans_equal && std::isnan(lhs_val) && std::isnan(rhs_val)) { return true; } - // For floating-point types, if both element(i) and element(j) are NaNs then this comparison - // will return `true`. This is the desired behavior in Pandas. - auto const lhs = d_view.element(i); - auto const rhs = d_view.element(j); - if (std::isnan(lhs) and std::isnan(rhs)) { return true; } - return lhs == rhs; + // If nans are considered as NOT equal, even both element(i) and element(j) are NaNs this + // comparison will still return `false`. This is the desired behavior in Apache Spark. + return lhs_val == rhs_val; } - template - std::enable_if_t or not nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + bool __device__ operator()(size_type i, size_type j) const noexcept { - // Two entries are not considered for equality if they belong to different lists + // Two entries are not considered for equality if they belong to different lists. 
if (list_offsets[i] != list_offsets[j]) { return false; } if (has_nulls) { - bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; - bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; - if (lhs_is_null and rhs_is_null) { + bool const lhs_is_null{lhs.nullable() && lhs.is_null_nocheck(i)}; + bool const rhs_is_null{rhs.nullable() && rhs.is_null_nocheck(j)}; + if (lhs_is_null && rhs_is_null) { return nulls_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { return false; } } - // For floating-point types, if both element(i) and element(j) are NaNs then this comparison - // will return `false`. This is the desired behavior in Apache Spark. - return d_view.element(i) == d_view.element(j); + return compare(lhs.element(i), lhs.element(j)); + } +}; + +/** + * @brief Struct used in type_dispatcher for comparing two entries in a lists column. + */ +struct column_row_comparator_dispatch { + offset_type const* const list_offsets; + column_device_view const lhs; + column_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + __device__ column_row_comparator_dispatch(offset_type const* const list_offsets, + column_device_view const& lhs, + column_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) + { + } + + template ()>* = nullptr> + bool __device__ operator()(size_type i, size_type j) const noexcept + { + return column_row_comparator_fn{ + list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}(i, j); + } + + template ()>* = nullptr> + bool operator()(size_type i, size_type j) const + { + CUDF_FAIL( + "`column_row_comparator_dispatch` cannot operate on types that are not equally comparable."); } +}; - private: - offset_type const* list_offsets; - 
column_device_view d_view; - null_equality nulls_equal; - bool has_nulls; +/** + * @brief Performs an equality comparison between rows of two tables using + * `column_row_comparator_dispatch` to compare rows of their corresponding columns. + */ +struct table_row_comparator_fn { + offset_type const* const list_offsets; + table_device_view const lhs; + table_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + table_row_comparator_fn(offset_type const* const list_offsets, + table_device_view const& lhs, + table_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) + { + } + + bool __device__ operator()(size_type i, size_type j) const noexcept + { + auto column_comp = [=](column_device_view const& lhs, column_device_view const& rhs) { + return type_dispatcher( + lhs.type(), + column_row_comparator_dispatch{list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}, + i, + j); + }; + + return thrust::equal(thrust::seq, lhs.begin(), lhs.end(), rhs.begin(), column_comp); + } }; /** - * @brief Construct type-dispatched function object for copying indices of the list entries - * ignoring duplicates + * @brief Struct used in type_dispatcher for copying indices of the list entries ignoring + * duplicates.
*/ -struct get_unique_entries_fn { - template ()>* = nullptr> +struct get_unique_entries_dispatch { + template () && + !std::is_same_v>* = nullptr> offset_type* operator()(offset_type const*, - column_device_view&, + column_view const&, size_type, offset_type*, null_equality, @@ -301,12 +441,13 @@ struct get_unique_entries_fn { bool, rmm::cuda_stream_view) const { - CUDF_FAIL("Cannot operate on types that are not equally comparable."); + CUDF_FAIL( + "`get_unique_entries_dispatch` cannot operate on types that are not equally comparable."); } template ()>* = nullptr> offset_type* operator()(offset_type const* list_offsets, - column_device_view& d_view, + column_view const& all_lists_entries, size_type num_entries, offset_type* output_begin, null_equality nulls_equal, @@ -314,41 +455,69 @@ struct get_unique_entries_fn { bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - if (nans_equal == nan_equality::ALL_EQUAL) { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } else { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } + auto const d_view = column_device_view::create(all_lists_entries, stream); + auto const comp = column_row_comparator_fn{list_offsets, + *d_view, + *d_view, + nulls_equal, + has_nulls, + nans_equal == nan_equality::ALL_EQUAL}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } + + template >* = nullptr> + offset_type* operator()(offset_type const* list_offsets, + column_view const& all_lists_entries, + size_type num_entries, + 
offset_type* output_begin, + null_equality nulls_equal, + nan_equality nans_equal, + bool has_nulls, + rmm::cuda_stream_view stream) const noexcept + { + auto const entries_tview = table_view{{all_lists_entries}}; + auto const flatten_nullability = has_nested_nulls(entries_tview) + ? structs::detail::column_nullability::FORCE + : structs::detail::column_nullability::MATCH_INCOMING; + auto const entries_flattened = cudf::structs::detail::flatten_nested_columns( + entries_tview, {order::ASCENDING}, {null_order::AFTER}, flatten_nullability); + auto const d_view = table_device_view::create(std::get<0>(entries_flattened), stream); + + auto const comp = table_row_comparator_fn{list_offsets, + *d_view, + *d_view, + nulls_equal, + has_nulls, + nans_equal == nan_equality::ALL_EQUAL}; + + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); } }; /** - * @brief Copy list entries and entry list offsets ignoring duplicates + * @brief Copy list entries and entry list offsets ignoring duplicates. * * Given an array of all entries flattened from a list column and an array that maps each entry to * the offset of the list containing that entry, those entries and list offsets are copied into * new arrays such that the duplicated entries within each list will be ignored. * - * @param all_lists_entries The input array containing all list entries - * @param entries_list_offsets A map from list entries to their corresponding list offsets - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param nans_equal Flag to specify whether NaN entries should be considered as equal - * value (only applicable for floating-point data column) - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * + * @param all_lists_entries The input array containing all list entries. 
+ * @param entries_list_offsets A map from list entries to their corresponding list offsets. + * @param nulls_equal Flag to specify whether null entries should be considered equal. + * @param nans_equal Flag to specify whether NaN entries should be considered equal + * (only applicable for floating-point data column). + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. * @return A pair of columns, the first one contains unique list entries and the second one - * contains their corresponding list offsets + * contains their corresponding list offsets. */ std::vector> get_unique_entries_and_list_offsets( column_view const& all_lists_entries, @@ -358,16 +527,15 @@ std::vector> get_unique_entries_and_list_offsets( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const num_entries = all_lists_entries.size(); - auto const d_view_entries = column_device_view::create(all_lists_entries, stream); + auto const num_entries = all_lists_entries.size(); - // Allocate memory to store the indices of the unique entries + // Allocate memory to store the indices of the unique entries. auto unique_indices = rmm::device_uvector(num_entries, stream); auto const output_begin = unique_indices.begin(); auto const output_end = type_dispatcher(all_lists_entries.type(), - get_unique_entries_fn{}, + get_unique_entries_dispatch{}, entries_list_offsets.begin(), - *d_view_entries, + all_lists_entries, num_entries, output_begin, nulls_equal, @@ -375,9 +543,9 @@ std::vector> get_unique_entries_and_list_offsets( all_lists_entries.has_nulls(), stream); - // Collect unique entries and entry list offsets + // Collect unique entries and entry list offsets. // The new null_count and bitmask of the unique entries will also be generated - // by the gather function + // by the gather function. 
return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, output_begin, output_end, @@ -388,27 +556,27 @@ std::vector> get_unique_entries_and_list_offsets( } /** - * @brief Generate list offsets from entry offsets + * @brief Generate list offsets from entry offsets. * - * Generate an array of list offsets for the final result lists column. The list - * offsets of the original lists column are also taken into account to make sure the result lists - * column will have the same empty list rows (if any) as in the original lists column. + * Generate an array of list offsets for the final result lists column. The list offsets of the + * original lists column are also taken into account to make sure the result lists column will have + * the same empty list rows (if any) as in the original lists column. * - * @param[in] num_entries The number of unique entries after removing duplicates - * @param[in] entries_list_offsets The mapping from list entries to their list offsets - * @param[out] original_offsets The list offsets of the original lists column, which - * will also be used to store the new list offsets - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param[in] mr Device resource used to allocate memory + * @param num_entries The number of unique entries after removing duplicates. + * @param entries_list_offsets The mapping from list entries to their list offsets. + * @param original_offsets The list offsets of the original lists column, which will also be used to + * store the new list offsets. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. 
*/ void generate_offsets(size_type num_entries, column_view const& entries_list_offsets, mutable_column_view const& original_offsets, rmm::cuda_stream_view stream) { - // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any) + // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any). // If entries_list_offsets = {1, 1, 1, 1, 2, 3, 3, 3, 4, 4 }, num_entries = 10, - // then new_offsets = { 0, 4, 5, 8, 10 } + // then new_offsets = { 0, 4, 5, 8, 10 }. auto const new_offsets = allocate_like( original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource()); thrust::copy_if(rmm::exec_policy(stream), @@ -421,10 +589,9 @@ void generate_offsets(size_type num_entries, }); // Generate a prefix sum of number of empty lists, storing inplace to the original lists - // offsets + // offsets. // If the original list offsets is { 0, 0, 5, 5, 6, 6 } (there are 2 empty lists), - // and new_offsets = { 0, 4, 6 }, - // then output = { 0, 1, 1, 2, 2, 3} + // and new_offsets = { 0, 4, 6 }, then output = { 0, 1, 1, 2, 2, 3}. auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( 0, [offsets = original_offsets.begin()] __device__(auto i) { return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0; @@ -434,10 +601,10 @@ void generate_offsets(size_type num_entries, iter_trans_begin + original_offsets.size(), original_offsets.begin()); - // Generate the final list offsets + // Generate the final list offsets. // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, - // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, - // then output = { 0, 0, 4, 4, 5, 5 } + // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, + // then output = { 0, 0, 4, 4, 5, 5 }. 
thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(original_offsets.size()), @@ -453,7 +620,7 @@ void generate_offsets(size_type num_entries, /** * @copydoc cudf::lists::drop_list_duplicates * - * @param stream CUDA stream used for device memory operations and kernel launches + * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, @@ -462,22 +629,23 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu rmm::mr::device_memory_resource* mr) { if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); - if (cudf::is_nested(lists_column.child().type())) { - CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); + if (auto const child_type = lists_column.child().type(); + cudf::is_nested(child_type) && child_type.id() != type_id::STRUCT) { + CUDF_FAIL("Nested types other than STRUCT are not supported in `drop_list_duplicates`."); } - // Flatten all entries (depth = 1) of the lists column + // Flatten all entries (depth = 1) of the lists column. auto const lists_entries = lists_column.get_sliced_child(stream); - // sorted_lists will store the results of the original lists after calling segmented_sort + // sorted_lists will store the results of the original lists after calling segmented_sort. auto const sorted_lists = [&]() { // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, - // we need to replace -NaN by NaN before sorting + // we need to replace -NaN by NaN before sorting. 
auto const replace_negative_nan = - nans_equal == nan_equality::ALL_EQUAL and - type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + nans_equal == nan_equality::ALL_EQUAL && + type_dispatcher( + lists_entries.type(), detail::has_negative_nans_dispatch{}, lists_entries, stream); if (replace_negative_nan) { - // The column new_lists_column is temporary, thus we will not pass in `mr` auto const new_lists_column = detail::replace_negative_nans_entries(lists_entries, lists_column, stream); return detail::sort_lists( @@ -490,28 +658,28 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu auto const sorted_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); - // Generate a 0-based offset column + // Generate a 0-based offset column. auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); - // Generate a mapping from list entries to offsets of the lists containing those entries + // Generate a mapping from list entries to offsets of the lists containing those entries. auto const entries_list_offsets = detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); - // Copy non-duplicated entries (along with their list offsets) to new arrays + // Copy non-duplicated entries (along with their list offsets) to new arrays. auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); - // Generate offsets for the new lists column + // Generate offsets for the new lists column. detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), unique_entries_and_list_offsets.back()->view(), lists_offsets->mutable_view(), stream); - // Construct a new lists column without duplicated entries + // Construct a new lists column without duplicated entries. 
// Reuse the null_count and bitmask of the lists_column: those are the null information for - // the list elements (rows) + // the list elements (rows). // For the entries of those lists (rows), their null_count and bitmask were generated separately - // during the step `get_unique_entries_and_list_offsets` above + // during the step `get_unique_entries_and_list_offsets` above. return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index bc413fd220a..270e01075b9 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,61 +14,65 @@ * limitations under the License. */ +#include + #include #include +#include #include #include -#include #include #include -using int_type = int32_t; -using float_type = float; - -using LIST_COL_FLT = cudf::test::lists_column_wrapper; -using LIST_COL_STR = cudf::test::lists_column_wrapper; +using namespace cudf::test::iterators; -auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); -auto constexpr neg_Inf = -std::numeric_limits::infinity(); -auto constexpr NaN = std::numeric_limits::quiet_NaN(); -auto constexpr Inf = std::numeric_limits::infinity(); +using float_type = float; +using FloatListsCol = cudf::test::lists_column_wrapper; +using StrListsCol = cudf::test::lists_column_wrapper; +using StringsCol = cudf::test::strings_column_wrapper; +using StructsCol = cudf::test::structs_column_wrapper; +using IntsCol = cudf::test::fixed_width_column_wrapper; +using FloatsCol = cudf::test::fixed_width_column_wrapper; -template -void test_once(cudf::column_view const& input, - LCW const& expected, - cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) -{ - auto const results = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if 
(cudf::is_floating_point(input.type())) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); - } else { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - } -} +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); +auto constexpr verbosity = cudf::test::debug_output_level::FIRST_ERROR; struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) { - // -0.0 and 0.0 should be considered equal - test_once(LIST_COL_FLT{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}, - LIST_COL_FLT{0, 1, 2}); + // -0.0 and 0.0 should be considered equal. + auto const lists = FloatListsCol{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}; + auto const expected = FloatListsCol{0, 1, 2}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) { - // Lists contain inf - test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}, LIST_COL_FLT{0, 1, 2, Inf}); - test_once(LIST_COL_FLT{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}, - LIST_COL_FLT{neg_Inf, 0, Inf}); + // Lists contain inf. 
+ { + auto const lists = FloatListsCol{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}; + auto const expected = FloatListsCol{0, 1, 2, Inf}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + { + auto const lists = FloatListsCol{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; + auto const expected = FloatListsCol{neg_Inf, 0, Inf}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to -// check for validity +// check for validity. // We will not store NaN in the results_expected variable (an unordered_set) because we can't check // for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare // with the number of NaNs in the output. @@ -77,14 +81,14 @@ static void test_floating_point(std::vector const& h_input, cudf::nan_equality nans_equal) { // If NaNs are considered as equal value, the final result should always contain at max ONE NaN - // entry per list + // entry per list. std::size_t const num_NaNs = nans_equal == cudf::nan_equality::ALL_EQUAL ? 
std::size_t{1} : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); auto const results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, + cudf::lists_column_view{FloatListsCol(h_input.begin(), h_input.end())}, cudf::null_equality::EQUAL, nans_equal); auto const results_arr = @@ -125,130 +129,479 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) TEST_F(DropListDuplicatesTest, StringTestsNonNull) { - // Trivial cases - test_once(LIST_COL_STR{{}}, LIST_COL_STR{{}}); - test_once(LIST_COL_STR{"this", "is", "a", "string"}, LIST_COL_STR{"a", "is", "string", "this"}); - - // One list column - test_once(LIST_COL_STR{"this", "is", "is", "is", "a", "string", "string"}, - LIST_COL_STR{"a", "is", "string", "this"}); - - // Multiple lists column - test_once( - LIST_COL_STR{LIST_COL_STR{"this", "is", "a", "no duplicate", "string"}, - LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}, - LIST_COL_STR{"this", "is", "is", "is", "a", "two duplicates", "string"}, - LIST_COL_STR{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, - LIST_COL_STR{LIST_COL_STR{"a", "is", "no duplicate", "string", "this"}, - LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}, - LIST_COL_STR{"a", "is", "string", "this", "two duplicates"}, - LIST_COL_STR{"a", "is", "string", "this", "three duplicates"}}); + // Trivial cases - empty input. + { + auto const lists = StrListsCol{{}}; + auto const expected = StrListsCol{{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // No duplicate entry. 
+ { + auto const lists = StrListsCol{"this", "is", "a", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // One list column. + { + auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // One list column, input is a strings column with given non-default null_equality and + // nans_equality parameters. + { + auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // Multiple lists column. 
+ { + auto const lists = + StrListsCol{StrListsCol{"this", "is", "a", "no duplicate", "string"}, + StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}, + StrListsCol{"this", "is", "is", "is", "a", "two duplicates", "string"}, + StrListsCol{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}; + auto const expected = StrListsCol{StrListsCol{"a", "is", "no duplicate", "string", "this"}, + StrListsCol{"a", "is", "one duplicate", "string", "this"}, + StrListsCol{"a", "is", "string", "this", "two duplicates"}, + StrListsCol{"a", "is", "string", "this", "three duplicates"}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TEST_F(DropListDuplicatesTest, StringTestsWithNulls) { auto const null = std::string(""); - // One list column with null entries - test_once( - LIST_COL_STR{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, - LIST_COL_STR{{"a", "is", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + // One list column with null entries. 
+ { + auto const lists = StrListsCol{ + {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; + auto const expected = StrListsCol{{"a", "is", "string", "this", null}, null_at(4)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } // Multiple lists column with null lists and null entries - test_once( - LIST_COL_STR{ - {LIST_COL_STR{ - {"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; })}, - LIST_COL_STR{}, - LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, - LIST_COL_STR{{LIST_COL_STR{{"a", "is", "no duplicate", "string", "this", null}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i <= 4; })}, - LIST_COL_STR{}, - LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); + { + auto const lists = StrListsCol{ + {StrListsCol{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + nulls_at({1, 3, 5, 7})}, + StrListsCol{}, /* NULL */ + StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}}, + null_at(1)}; + auto const expected = + StrListsCol{{StrListsCol{{"a", "is", "no duplicate", "string", "this", null}, null_at(5)}, + StrListsCol{}, /* NULL */ + StrListsCol{"a", "is", "one duplicate", "string", "this"}}, + null_at(1)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } template struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { }; -#define LIST_COL cudf::test::lists_column_wrapper using TypesForTest = cudf::test::Concat; 
-TYPED_TEST_CASE(DropListDuplicatesTypedTest, TypesForTest); +TYPED_TEST_SUITE(DropListDuplicatesTypedTest, TypesForTest); TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) { - // Lists of nested types are not supported + using ListsCol = cudf::test::lists_column_wrapper; + + // Nested types (except struct) are not supported. EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL{LIST_COL{{1, 2}, {3}}}}), + cudf::lists::drop_list_duplicates(cudf::lists_column_view{ListsCol{ListsCol{{1, 2}, {3}}}}), cudf::logic_error); } TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) { - // Empty input - test_once(LIST_COL{{}}, LIST_COL{{}}); + using ListsCol = cudf::test::lists_column_wrapper; + + // Empty input. + { + auto const lists = ListsCol{{}}; + auto const expected = ListsCol{{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } - // Trivial cases - test_once(LIST_COL{0, 1, 2, 3, 4, 5}, LIST_COL{0, 1, 2, 3, 4, 5}); + // Trivial cases. + { + auto const lists = ListsCol{0, 1, 2, 3, 4, 5}; + auto const expected = ListsCol{0, 1, 2, 3, 4, 5}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } - // Multiple empty lists - test_once(LIST_COL{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - LIST_COL{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + // Multiple empty lists. 
+ { + auto const lists = ListsCol{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; + auto const expected = ListsCol{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) { - // Adjacent lists containing the same entries - test_once(LIST_COL{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, - LIST_COL{{1}, {1, 2}, {2, 3}}); - - // Sliced list column - auto const list0 = - LIST_COL{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - auto const list1 = cudf::slice(list0, {0, 5})[0]; - auto const list2 = cudf::slice(list0, {1, 5})[0]; - auto const list3 = cudf::slice(list0, {1, 3})[0]; - auto const list4 = cudf::slice(list0, {0, 3})[0]; - - test_once(list0, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list2, LIST_COL{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list3, LIST_COL{{1, 2, 3, 4}, {5}}); - test_once(list4, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}}); + using ListsCol = cudf::test::lists_column_wrapper; + + // Adjacent lists containing the same entries. + { + auto const lists = + ListsCol{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}; + auto const expected = ListsCol{{1}, {1, 2}, {2, 3}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // Sliced list column. 
+ auto const lists_original = + ListsCol{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const lists1 = cudf::slice(lists_original, {0, 5})[0]; + auto const lists2 = cudf::slice(lists_original, {1, 5})[0]; + auto const lists3 = cudf::slice(lists_original, {1, 3})[0]; + auto const lists4 = cudf::slice(lists_original, {0, 3})[0]; + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists_original}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3, 4}, {5}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists4}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) { + using ListsCol = cudf::test::lists_column_wrapper; auto constexpr null = TypeParam{0}; - // null lists - test_once(LIST_COL{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}, - LIST_COL{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - 
cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}); - - // null entries are equal - test_once( - LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 5; })}); - - // nulls entries are not equal - test_once( - LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; })}, - cudf::null_equality::UNEQUAL); + // null lists. + { + auto const lists = ListsCol{ + {{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, nulls_at({2, 3})}; + auto const expected = + ListsCol{{{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // null entries are equal. + { + auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + ListsCol{std::initializer_list{1, 3, 5, 7, 9, null}, null_at(5)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // nulls entries are not equal. 
+ { + auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + ListsCol{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, + nulls_at({5, 6, 7, 8, 9})}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}, + cudf::null_equality::UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsNoNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const get_structs = [] { + auto child1 = ColWrapper{ + 1, 1, 1, 1, 1, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, 2, 2, // list2 + 2, 2, 2, 2, 3, 2, 3, 3 // list3 + }; + auto child2 = StringsCol{ + // begin list1 + "Banana", + "Mango", + "Apple", + "Cherry", + "Kiwi", + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "XYZ", + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }; + return StructsCol{{child1, child2}}; + }; + + auto const get_structs_expected = [] { + auto child1 = ColWrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3}; + auto child2 = StringsCol{ + // begin list1 + "Apple", + "Banana", + "Cherry", + "Kiwi", + "Mango", // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC" // end list3 + }; + return StructsCol{{child1, child2}}; + }; + + // Test full columns. 
+ { + auto const lists = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level + auto constexpr null = int32_t{0}; // nulls at the children columns level + + auto const get_structs = [] { + auto child1 = ColWrapper{{ + 1, 1, null, XXX, XXX, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto child2 = StringsCol{{ + // begin list1 + "Banana", + "Mango", + "Apple", + "XXX", /*NULL*/ + "XXX", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + return StructsCol{{child1, 
child2}, nulls_at({3, 4})}; + }; + + auto const get_structs_expected = [] { + auto child1 = + ColWrapper{{1, 1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, + nulls_at({4, 5, 11, 18, 19})}; + auto child2 = StringsCol{{ + // begin list1 + "Banana", + "Cherry", + "Kiwi", + "Mango", + "Apple", + "XXX" /*NULL*/, // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", + "" /*NULL*/, // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÉÉÉÉÉ" // end list3 + }, + nulls_at({5, 11, 17})}; + return StructsCol{{child1, child2}, null_at(5)}; + }; + + // Test full columns. + { + auto const lists = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) +{ + auto const h_child = std::vector{ + 0, -1, 1, 0, 2, 0, 1, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + + auto const get_structs = [&] { + // Two children are just identical. 
+ auto child1 = FloatsCol(h_child.begin(), h_child.end()); + auto child2 = FloatsCol(h_child.begin(), h_child.end()); + return StructsCol{{child1, child2}}; + }; + + // The first list does not have any NaN or -NaN, while the second list has both. + // `drop_list_duplicates` is expected to operate properly on this second list. + auto const lists_original = + cudf::make_lists_column(2, IntsCol{0, 10, 18}.release(), get_structs().release(), 0, {}); + auto const lists2 = cudf::slice(lists_original->view(), {1, 2})[0]; // test on the second list + + // Contain expected values excluding NaN. + auto const results_children_expected = std::unordered_set{0, 1, 2}; + + // Test for cudf::nan_equality::UNEQUAL. + { + auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); + auto const child = cudf::lists_column_view(results_col->view()).child(); + auto const results_arr = cudf::test::to_host(child.child(0)).first; + + std::size_t const num_NaNs = + std::count_if(h_child.begin(), h_child.end(), [](auto x) { return std::isnan(x); }); + EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); + } + + // Test for cudf::nan_equality::ALL_EQUAL. 
+ { + auto const results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists2}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); + auto const child = cudf::lists_column_view(results_col->view()).child(); + auto const results_arr = cudf::test::to_host(child.child(0)).first; + + std::size_t const num_NaNs = 1; + EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); + } } diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index c26059ee09b..5631c910753 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -2168,34 +2168,45 @@ TEST_F(CollectSetTest, BasicRollingWindowWithNaNs) result_with_nan_equal->view()); } -TEST_F(CollectSetTest, ListTypeRollingWindow) +TEST_F(CollectSetTest, StructTypeRollingWindow) { using namespace cudf; using namespace cudf::test; - auto const input_column = lists_column_wrapper{{1, 2, 3}, {4, 5}, {6}, {7, 8, 9}, {10}}; - - auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; - auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; + auto col1 = fixed_width_column_wrapper{1, 2, 3, 4, 5}; + auto col2 = strings_column_wrapper{"a", "b", "c", "d", "e"}; + auto const input_column = cudf::test::structs_column_wrapper{{col1, col2}}; + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; - EXPECT_THROW(rolling_window(input_column, - prev_column, - foll_column, - 1, - *make_collect_set_aggregation()), - cudf::logic_error); + auto const expected = [] { + auto child1 = fixed_width_column_wrapper{1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5}; + auto 
child2 = + strings_column_wrapper{"a", "b", "a", "b", "c", "b", "c", "d", "c", "d", "e", "d", "e"}; + return cudf::make_lists_column( + 5, + fixed_width_column_wrapper{0, 2, 5, 8, 11, 13}.release(), + structs_column_wrapper{{child1, child2}}.release(), + 0, + {}); + }(); + auto const result = rolling_window(input_column, + prev_column, + foll_column, + 1, + *make_collect_set_aggregation()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected->view(), result->view()); } -TEST_F(CollectSetTest, StructTypeRollingWindow) +TEST_F(CollectSetTest, ListTypeRollingWindow) { using namespace cudf; using namespace cudf::test; - auto col1 = fixed_width_column_wrapper{1, 2, 3, 4, 5}; - auto col2 = strings_column_wrapper{"a", "b", "c", "d", "e"}; - auto const input_column = cudf::test::structs_column_wrapper{{col1, col2}}; - auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; - auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; + auto const input_column = lists_column_wrapper{{1, 2, 3}, {4, 5}, {6}, {7, 8, 9}, {10}}; + + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; EXPECT_THROW(rolling_window(input_column, prev_column,