diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 7f7db131a93..a0a25cb51a2 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -49,9 +49,8 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * r is now [{0, 1, 8}, {2, 3, 4, 9}, {5}, {10, 11, 12}, {6, 7, 13, 14, 15, 16}] * @endcode * - * @throws cudf::logic_error if any column of the input table is not a lists columns. - * @throws cudf::logic_error if any lists column contains nested typed entry. - * @throws cudf::logic_error if all lists columns do not have the same entry type. + * @throws cudf::logic_error if any column of the input table is not a lists column. + * @throws cudf::logic_error if all lists columns do not have the same type. * * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 09f0b653466..58f08217a87 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -15,22 +15,170 @@ */ #include -#include +#include +#include +#include +#include #include #include -#include -#include -#include +#include #include #include -#include -#include +#include namespace cudf { namespace lists { namespace detail { + +namespace { + +/** + * @brief Generates the new set of offsets that regroups the concatenated-by-column inputs + * into concatenated-by-rows inputs, and the associated null mask. + * + * If we have the following input columns: + * + * s1 = [{0, 1}, {2, 3, 4}, {5}, {}, {6, 7}] + * s2 = [{8}, {9}, {}, {10, 11, 12}, {13, 14, 15, 16}] + * + * We can rearrange the child data using a normal concatenate and a gather such that + * the resulting values are in the correct order. 
For the above example, the + child column would look like: + * + * {0, 1, 8, 2, 3, 4, 9, 5, 10, 11, 12, 6, 7, 13, 14, 15, 16} + * + * Because we did a regular concatenate (and a subsequent gather to reorder the rows), + * the top level rows of the list column would look like: + * + * (2N rows) + * [{0, 1}, {8}, {2, 3, 4}, {9}, {5}, {}, {}, {10, 11, 12}, {6, 7}, {13, 14, 15, 16}] + * + * What we really want is: + * + * (N rows) + * [{0, 1, 8}, {2, 3, 4, 9}, {5}, {10, 11, 12}, {6, 7, 13, 14, 15, 16}] + * + * We can do this by recomputing a new offsets column that does this regrouping. + * + */ +std::tuple, rmm::device_buffer, size_type> +generate_regrouped_offsets_and_null_mask(table_device_view const& input, + bool build_null_mask, + concatenate_null_policy null_policy, + device_span row_null_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // outgoing offsets. + auto offsets = cudf::make_fixed_width_column(data_type{type_to_id()}, + input.num_rows() + 1, + mask_state::UNALLOCATED, + stream, + mr); + + auto keys = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + [num_columns = input.num_columns()] __device__( + size_t i) -> size_type { return i / num_columns; }); + + // generate sizes for the regrouped rows + auto values = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + [input, row_null_counts = row_null_counts.data(), null_policy] __device__( + size_t i) -> offset_type { + auto const col_index = i % input.num_columns(); + auto const row_index = i / input.num_columns(); + + // nullify the whole output row + if (row_null_counts) { + if ((null_policy == concatenate_null_policy::NULLIFY_OUTPUT_ROW && + row_null_counts[row_index] > 0) || + (null_policy == concatenate_null_policy::IGNORE && + row_null_counts[row_index] == input.num_columns())) { + return 0; + } + } + auto offsets = + input.column(col_index).child(lists_column_view::offsets_column_index).data() +
input.column(col_index).offset(); + return offsets[row_index + 1] - offsets[row_index]; + }); + + thrust::reduce_by_key(rmm::exec_policy(stream), + keys, + keys + (input.num_rows() * input.num_columns()), + values, + thrust::make_discard_iterator(), + offsets->mutable_view().begin()); + + // convert to offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + offsets->view().begin(), + offsets->view().begin() + input.num_rows() + 1, + offsets->mutable_view().begin(), + 0); + + // generate appropriate null mask + auto [null_mask, null_count] = [&]() { + // if the input doesn't contain nulls, no work to do + if (!build_null_mask) { + return std::pair{rmm::device_buffer{}, 0}; + } + + // row is null if -all- input rows are null + if (null_policy == concatenate_null_policy::IGNORE) { + return cudf::detail::valid_if( + row_null_counts.begin(), + row_null_counts.begin() + input.num_rows(), + [num_columns = input.num_columns()] __device__(size_type null_count) { + return null_count != num_columns; + }, + stream, + mr); + } + + // row is null if -any- input rows are null + return cudf::detail::valid_if( + row_null_counts.begin(), + row_null_counts.begin() + input.num_rows(), + [] __device__(size_type null_count) { return null_count == 0; }, + stream, + mr); + }(); + + return {std::move(offsets), std::move(null_mask), null_count}; +} + +rmm::device_uvector generate_null_counts(table_device_view const& input, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector null_counts(input.num_rows(), stream); + + auto keys = thrust::make_transform_iterator(thrust::make_counting_iterator(size_t{0}), + [num_columns = input.num_columns()] __device__( + size_t i) -> size_type { return i / num_columns; }); + + auto null_values = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), [input] __device__(size_t i) -> size_type { + auto const col_index = i % input.num_columns(); + auto const row_index = i / input.num_columns(); + auto const& col = 
input.column(col_index); + return col.null_mask() ? (bit_is_set(col.null_mask(), row_index + col.offset()) ? 0 : 1) : 0; + }); + + thrust::reduce_by_key(rmm::exec_policy(stream), + keys, + keys + (input.num_rows() * input.num_columns()), + null_values, + thrust::make_discard_iterator(), + null_counts.data()); + + return null_counts; +} + +} // anonymous namespace + /** * @copydoc cudf::lists::concatenate_rows * @@ -44,49 +192,104 @@ std::unique_ptr concatenate_rows(table_view const& input, CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); auto const entry_type = lists_column_view(*input.begin()).child().type(); - for (auto const& col : input) { - CUDF_EXPECTS(col.type().id() == type_id::LIST, - "All columns of the input table must be of lists column type."); - - auto const child_col = lists_column_view(col).child(); - CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); - CUDF_EXPECTS(entry_type == child_col.type(), - "The types of entries in the input columns must be the same."); - } + CUDF_EXPECTS( + std::all_of(input.begin(), + input.end(), + [](column_view const& col) { return col.type().id() == cudf::type_id::LIST; }), + "All columns of the input table must be of lists column type."); + CUDF_EXPECTS( + std::all_of(std::next(input.begin()), + input.end(), + [a = *input.begin()](column_view const& b) { return column_types_equal(a, b); }), + "The types of entries in the input columns must be the same."); auto const num_rows = input.num_rows(); auto const num_cols = input.num_columns(); if (num_rows == 0) { return cudf::empty_like(input.column(0)); } if (num_cols == 1) { return std::make_unique(*(input.begin()), stream, mr); } - // Memory resource for temporary data. - auto const default_mr = rmm::mr::get_current_device_resource(); - - // Interleave the input table into one column. 
- auto const has_null_mask = std::any_of( - std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream, default_mr); - - // Generate a lists column which has child column is the interleaved_columns. - // The new nested lists column will have each row is a list of `num_cols` list elements. - static_assert(std::is_same_v and std::is_same_v); - auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, default_mr); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows + 1), - list_offsets->mutable_view().template begin(), - [num_cols] __device__(auto const idx) { return idx * num_cols; }); - auto const nested_lists_col = make_lists_column(num_rows, - std::move(list_offsets), - std::move(interleaved_columns), - 0, - rmm::device_buffer{}, - stream, - default_mr); - - // Concatenate lists on each row of the nested lists column, producing the desired output. - return concatenate_list_elements(nested_lists_col->view(), null_policy, stream, mr); + // concatenate the input table into one column. + std::vector cols(input.num_columns()); + std::copy(input.begin(), input.end(), cols.begin()); + auto concat = cudf::detail::concatenate(cols, stream); + + // whether or not we should be generating a null mask at all + auto const build_null_mask = concat->has_nulls(); + + auto input_dv = table_device_view::create(input, stream); + + // if the output needs a null mask, generate a vector of null counts per row of input, where the + // count is the number of columns that contain a null for a given row. + auto row_null_counts = build_null_mask ? 
generate_null_counts(*input_dv, stream) + : rmm::device_uvector{0, stream}; + + // if we have nulls, overlay an appropriate null mask onto the + // concatenated column so that gather() sanitizes out the child data of rows that will ultimately + // be nullified. + if (build_null_mask) { + auto [null_mask, null_count] = [&]() { + auto iter = thrust::make_counting_iterator(size_t{0}); + + // IGNORE. Output row is nullified if all input rows are null. + if (null_policy == concatenate_null_policy::IGNORE) { + return cudf::detail::valid_if( + iter, + iter + (input.num_rows() * input.num_columns()), + [num_rows = input.num_rows(), + num_columns = input.num_columns(), + row_null_counts = row_null_counts.data()] __device__(size_t i) -> size_type { + auto const row_index = i % num_rows; + return row_null_counts[row_index] != num_columns; + }); + } + // NULLIFY_OUTPUT_ROW. Output row is nullified if any input row is null + return cudf::detail::valid_if( + iter, + iter + (input.num_rows() * input.num_columns()), + [num_rows = input.num_rows(), + row_null_counts = row_null_counts.data()] __device__(size_t i) -> size_type { + auto const row_index = i % num_rows; + return row_null_counts[row_index] == 0; + }); + }(); + concat->set_null_mask(std::move(null_mask), null_count); + } + + // perform the gather to rearrange the rows in desired child order. this will produce -almost- + // what we want. the data of the children will be exactly what we want, but will be grouped as if + // we had concatenated all the rows together instead of concatenating within the rows. To fix + // this we can simply swap in a new set of offsets that re-groups them.
+ auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + [num_columns = input.num_columns(), + num_rows = input.num_rows()] __device__(size_t i) -> size_type { + auto const src_col_index = i % num_columns; + auto const src_row_index = i / num_columns; + auto const concat_row_index = (src_col_index * num_rows) + src_row_index; + return concat_row_index; + }); + auto gathered = cudf::detail::gather(table_view({*concat}), + iter, + iter + (input.num_columns() * input.num_rows()), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + // generate regrouped offsets and null mask + auto [offsets, null_mask, null_count] = generate_regrouped_offsets_and_null_mask( + *input_dv, build_null_mask, null_policy, row_null_counts, stream, mr); + + // reassemble the underlying child data with the regrouped offsets and null mask + column& col = gathered->get_column(0); + auto contents = col.release(); + return cudf::make_lists_column( + input.num_rows(), + std::move(offsets), + std::move(contents.children[lists_column_view::child_column_index]), + null_count, + std::move(null_mask), + stream, + mr); } } // namespace detail diff --git a/cpp/tests/lists/combine/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp index 17d31c3e387..ed8bf8abb8d 100644 --- a/cpp/tests/lists/combine/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -56,13 +56,6 @@ TEST_F(ListConcatenateRowsTest, InvalidInput) EXPECT_THROW(cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}), cudf::logic_error); } - - // Nested types are not supported - { - auto const col = IntListsCol{{IntListsCol{1, 2, 3}, IntListsCol{4, 5, 6}}}.release(); - EXPECT_THROW(cudf::lists::concatenate_rows(TView{{col->view(), col->view()}}), - cudf::logic_error); - } } template @@ -486,3 +479,504 @@ TEST_F(ListConcatenateRowsTest, StringsColumnsWithEmptyListTest) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } + +struct ListConcatenateRowsNestedTypesTest : public cudf::test::BaseFixture { +}; + +TEST_F(ListConcatenateRowsNestedTypesTest, Identity) +{ + // list> + + // col 0 + cudf::test::lists_column_wrapper l0{ + {{{{"whee", "yay", "bananas"}, nulls_at({1})}, {}}, + {{}}, + {{{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}}, nulls_at({0, 2})}, + {{"f", "tesla"}}, + {{"phone"}, {"hack", "table", "car"}}}, + nulls_at({3, 4})}; + + // perform the concatenate + cudf::table_view t({l0}); + auto result = cudf::lists::concatenate_rows(t); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, l0); +} + +TEST_F(ListConcatenateRowsNestedTypesTest, List) +{ + // list> + + // col 0 + cudf::test::lists_column_wrapper l0{ + {{"whee", "yay", "bananas"}, {}}, + {{}}, + {{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}}, + {{"f", "tesla"}}, + {{"phone"}, {"hack", "table", "car"}}}; + + // col1 + cudf::test::lists_column_wrapper l1{ + {{}}, + {{"arg"}, {"mno", "ampere"}, {"gpu"}, {"def"}}, + {{"", "hhh"}}, + {{"warp", "donuts", "parking"}, {"", "apply", "twelve", "mouse", "bbb"}, {"bbb", "pom"}, {}}, + {{}}}; + + // perform the concatenate + cudf::table_view t({l0, l1}); + auto result = cudf::lists::concatenate_rows(t); + + // expected + cudf::test::lists_column_wrapper expected{ + {{"whee", "yay", "bananas"}, {}, {}}, + {{}, {"arg"}, {"mno", "ampere"}, {"gpu"}, {"def"}}, + {{"abc"}, {"def", "g", "xyw", 
"ijk"}, {"x", "y", "", "column"}, {"", "hhh"}}, + {{"f", "tesla"}, + {"warp", "donuts", "parking"}, + {"", "apply", "twelve", "mouse", "bbb"}, + {"bbb", "pom"}, + {}}, + {{"phone"}, {"hack", "table", "car"}, {}}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); +} + +TEST_F(ListConcatenateRowsNestedTypesTest, ListWithNulls) +{ + // list> + + // clang-format off + + // col 0 + cudf::test::lists_column_wrapper + l0{ { + {{{"whee", "yay", "bananas"}, nulls_at({1})}, {}}, + {{}}, + {{{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}}, nulls_at({0, 2})}, + {{"f", "tesla"}}, + {{"phone"}, {"hack", "table", "car"}} + }, nulls_at({3, 4}) }; + + // col1 + cudf::test::lists_column_wrapper + l1{ { + {{}}, + {{"arg"}, {"mno", "ampere"}, {"gpu"}, {"def"}}, + {{{{"", "hhh"}, nulls_at({0})}, {"www"}}, nulls_at({1})}, + {{"warp", "donuts", "parking"}, { "", "apply", "twelve", "mouse", "bbb"}, {"bbb", "pom"}, {}}, + {{}} + }, nulls_at({4}) }; + + // col2 + cudf::test::lists_column_wrapper + l2{ { + {{"monitor", "sugar"}}, + {{"spurs", "garlic"}, {"onion", "shallot", "carrot"}}, + {{"cars", "trucks", "planes"}, {"abc"}, {"mno", "pqr"}}, + {{}, {"ram", "cpu", "disk"}, {}}, + {{"round"}, {"square"}} + }, nulls_at({0, 4}) }; + + // concatenate_policy::IGNORE_NULLS + { + // perform the concatenate + cudf::table_view t({l0, l1, l2}); + auto result = cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::IGNORE); + + // expected + cudf::test::lists_column_wrapper + expected{ { + {{{"whee", "yay", "bananas"}, nulls_at({1})}, {}, {}}, + {{}, {"arg"}, {"mno", "ampere"}, {"gpu"}, {"def"}, {"spurs", "garlic"}, {"onion", "shallot", "carrot"}}, + {{{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}, + {{"", "hhh"}, nulls_at({0})}, {"www"}, {"cars", "trucks", "planes"}, {"abc"}, {"mno", "pqr"}}, + nulls_at({0, 2, 4}) }, + {{"warp", "donuts", "parking"}, { "", "apply", "twelve", "mouse", "bbb"}, {"bbb", "pom"}, {}, {}, {"ram", "cpu", "disk"}, {}}, + 
{{}} + }, nulls_at({4}) }; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + } + + // concatenate_policy::NULLIFY_OUTPUT_ROW + { + // perform the concatenate + cudf::table_view t({l0, l1, l2}); + auto result = cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + + // expected + cudf::test::lists_column_wrapper + expected{ { + {{}}, + {{}, {"arg"}, {"mno", "ampere"}, {"gpu"}, {"def"}, {"spurs", "garlic"}, {"onion", "shallot", "carrot"}}, + {{{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}, + {{"", "hhh"}, nulls_at({0})}, {"www"}, {"cars", "trucks", "planes"}, {"abc"}, {"mno", "pqr"}}, + nulls_at({0, 2, 4}) }, + {{}}, + {{}} + }, nulls_at({0, 3, 4}) }; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + } + + // clang-format on +} + +TEST_F(ListConcatenateRowsNestedTypesTest, ListWithNullsSliced) +{ + // list> + + // clang-format off + + // col 0 + cudf::test::lists_column_wrapper + unsliced_l0{ { + {{{"whee", "yay", "bananas"}, nulls_at({1})}, {}}, + {{}}, + {{{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}}, nulls_at({0, 2})}, + {{"f", "tesla"}}, + {{"phone"}, {"hack", "table", "car"}} + }, nulls_at({3, 4}) }; + auto l0 = cudf::split(unsliced_l0, {2})[1]; + + // col1 + cudf::test::lists_column_wrapper + unsliced_l1{ { + {{}}, + {{"arg"}, {"mno", "ampere"}, {"gpu"}, {"def"}}, + {{{{"", "hhh"}, nulls_at({0})}, {"www"}}, nulls_at({1})}, + {{"warp", "donuts", "parking"}, { "", "apply", "twelve", "mouse", "bbb"}, {"bbb", "pom"}, {}}, + {{}} + }, nulls_at({4}) }; + auto l1 = cudf::split(unsliced_l1, {2})[1]; + + // concatenate_policy::IGNORE_NULLS + { + // perform the concatenate + cudf::table_view t({l0, l1}); + auto result = cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::IGNORE); + + // expected + cudf::test::lists_column_wrapper + expected{ { {{{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}, + {{"", "hhh"}, nulls_at({0})}, {"www"}}, nulls_at({0, 2, 
4}) }, + {{"warp", "donuts", "parking"}, { "", "apply", "twelve", "mouse", "bbb"}, {"bbb", "pom"}, {}}, + {{}} + }, nulls_at({2}) }; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + } + + // concatenate_policy::NULLIFY_OUTPUT_ROW + { + // perform the concatenate + cudf::table_view t({l0, l1}); + auto result = cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + + // expected + cudf::test::lists_column_wrapper + expected{ { {{{"abc"}, {"def", "g", "xyw", "ijk"}, {"x", "y", "", "column"}, + {{"", "hhh"}, nulls_at({0})}, {"www"}}, nulls_at({0, 2, 4}) }, + {{}}, + {{}} + }, nulls_at({1, 2}) }; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + } + + // clang-format on +} + +TEST_F(ListConcatenateRowsNestedTypesTest, Struct) +{ + // list> + + // col 0 + cudf::test::fixed_width_column_wrapper s0_0{0, 1, 2, 3, 4, 5, 6, 7}; + cudf::test::strings_column_wrapper s0_1{ + "whee", "yay", "bananas", "abc", "def", "g", "xyw", "ijk"}; + std::vector> s0_children; + s0_children.push_back(s0_0.release()); + s0_children.push_back(s0_1.release()); + cudf::test::structs_column_wrapper s0(std::move(s0_children)); + cudf::test::fixed_width_column_wrapper l0_offsets{0, 2, 2, 5, 6, 8}; + auto const l0_size = static_cast(l0_offsets).size() - 1; + auto l0 = cudf::make_lists_column(l0_size, l0_offsets.release(), s0.release(), 0, {}); + + // col1 + cudf::test::fixed_width_column_wrapper s1_0{ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}; + cudf::test::strings_column_wrapper s1_1{"arg", + "mno", + "ampere", + "gpu", + "", + "hhh", + "warp", + "donuts", + "parking", + "", + "apply", + "twelve", + "mouse", + "bbb", + "pom"}; + std::vector> s1_children; + s1_children.push_back(s1_0.release()); + s1_children.push_back(s1_1.release()); + cudf::test::structs_column_wrapper s1(std::move(s1_children)); + cudf::test::fixed_width_column_wrapper l1_offsets{0, 0, 4, 7, 15, 15}; + auto const l1_size = static_cast(l1_offsets).size() - 1; 
+ auto l1 = cudf::make_lists_column(l1_size, l1_offsets.release(), s1.release(), 0, {}); + + // perform the concatenate + cudf::table_view t({*l0, *l1}); + auto result = cudf::lists::concatenate_rows(t); + + // expected + cudf::test::fixed_width_column_wrapper se_0{0, 1, 10, 11, 12, 13, 2, 3, 4, 14, 15, 16, + 5, 17, 18, 19, 20, 21, 22, 23, 24, 6, 7}; + cudf::test::strings_column_wrapper se_1{"whee", "yay", "arg", "mno", "ampere", "gpu", + "bananas", "abc", "def", "", "hhh", "warp", + "g", "donuts", "parking", "", "apply", "twelve", + "mouse", "bbb", "pom", "xyw", "ijk"}; + std::vector> se_children; + se_children.push_back(se_0.release()); + se_children.push_back(se_1.release()); + cudf::test::structs_column_wrapper se(std::move(se_children)); + cudf::test::fixed_width_column_wrapper le_offsets{0, 2, 6, 12, 21, 23}; + auto const le_size = static_cast(le_offsets).size() - 1; + auto expected = cudf::make_lists_column(le_size, le_offsets.release(), se.release(), 0, {}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TEST_F(ListConcatenateRowsNestedTypesTest, StructWithNulls) +{ + // list> + + // col 0 + cudf::test::fixed_width_column_wrapper s0_0{0, 1, 2, 3, 4, 5, 6, 7}; + cudf::test::strings_column_wrapper s0_1{ + {"whee", "yay", "bananas", "abc", "def", "g", "xyw", "ijk"}, nulls_at({1, 3, 4})}; + std::vector> s0_children; + s0_children.push_back(s0_0.release()); + s0_children.push_back(s0_1.release()); + cudf::test::structs_column_wrapper s0(std::move(s0_children)); + cudf::test::fixed_width_column_wrapper l0_offsets{0, 2, 2, 5, 6, 8}; + auto const l0_size = static_cast(l0_offsets).size() - 1; + std::vector l0_validity{false, true, true, false, true}; + auto l0 = cudf::make_lists_column( + l0_size, + l0_offsets.release(), + s0.release(), + 2, + cudf::test::detail::make_null_mask(l0_validity.begin(), l0_validity.end())); + + // col1 + cudf::test::fixed_width_column_wrapper s1_0{ + {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}, 
nulls_at({14})}; + cudf::test::strings_column_wrapper s1_1{"arg", + "mno", + "ampere", + "gpu", + "", + "hhh", + "warp", + "donuts", + "parking", + "", + "apply", + "twelve", + "mouse", + "bbb", + "pom"}; + std::vector> s1_children; + s1_children.push_back(s1_0.release()); + s1_children.push_back(s1_1.release()); + cudf::test::structs_column_wrapper s1(std::move(s1_children)); + cudf::test::fixed_width_column_wrapper l1_offsets{0, 0, 4, 7, 15, 15}; + auto const l1_size = static_cast(l1_offsets).size() - 1; + std::vector l1_validity{false, true, true, true, true}; + auto l1 = cudf::make_lists_column( + l1_size, + l1_offsets.release(), + s1.release(), + 1, + cudf::test::detail::make_null_mask(l1_validity.begin(), l1_validity.end())); + + // concatenate_policy::IGNORE_NULLS + { + // perform the concatenate + cudf::table_view t({*l0, *l1}); + auto result = cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::IGNORE); + + // expected + cudf::test::fixed_width_column_wrapper se_0{ + {10, 11, 12, 13, 2, 3, 4, 14, 15, 16, 5, 17, 18, 19, 20, 21, 22, 23, 24, 6, 7}, + nulls_at({18})}; + cudf::test::strings_column_wrapper se_1{ + {"arg", "mno", "ampere", "gpu", "bananas", "", "", "", "hhh", "warp", "g", + "donuts", "parking", "", "apply", "twelve", "mouse", "bbb", "pom", "xyw", "ijk"}, + nulls_at({5, 6})}; + std::vector> se_children; + se_children.push_back(se_0.release()); + se_children.push_back(se_1.release()); + cudf::test::structs_column_wrapper se(std::move(se_children)); + cudf::test::fixed_width_column_wrapper le_offsets{0, 0, 4, 10, 19, 21}; + auto const le_size = static_cast(le_offsets).size() - 1; + std::vector le_validity{false, true, true, true, true}; + auto expected = cudf::make_lists_column( + le_size, + le_offsets.release(), + se.release(), + 1, + cudf::test::detail::make_null_mask(le_validity.begin(), le_validity.end())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } + + // concatenate_policy::NULLIFY_OUTPUT_ROW + { + // 
perform the concatenate + cudf::table_view t({*l0, *l1}); + auto result = + cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + + // expected + cudf::test::fixed_width_column_wrapper se_0{{10, 11, 12, 13, 2, 3, 4, 14, 15, 16, 6, 7}, + nulls_at({})}; + cudf::test::strings_column_wrapper se_1{ + {"arg", "mno", "ampere", "gpu", "bananas", "", "", "", "hhh", "warp", "xyw", "ijk"}, + nulls_at({5, 6})}; + std::vector> se_children; + se_children.push_back(se_0.release()); + se_children.push_back(se_1.release()); + cudf::test::structs_column_wrapper se(std::move(se_children)); + cudf::test::fixed_width_column_wrapper le_offsets{0, 0, 4, 10, 10, 12}; + auto const le_size = static_cast(le_offsets).size() - 1; + std::vector le_validity{false, true, true, false, true}; + auto expected = cudf::make_lists_column( + le_size, + le_offsets.release(), + se.release(), + 2, + cudf::test::detail::make_null_mask(le_validity.begin(), le_validity.end())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } +} + +TEST_F(ListConcatenateRowsNestedTypesTest, StructWithNullsSliced) +{ + // list> + + // col 0 + cudf::test::fixed_width_column_wrapper s0_0{0, 1, 2, 3, 4, 5, 6, 7}; + cudf::test::strings_column_wrapper s0_1{ + {"whee", "yay", "bananas", "abc", "def", "g", "xyw", "ijk"}, nulls_at({1, 3, 4})}; + std::vector> s0_children; + s0_children.push_back(s0_0.release()); + s0_children.push_back(s0_1.release()); + cudf::test::structs_column_wrapper s0(std::move(s0_children)); + cudf::test::fixed_width_column_wrapper l0_offsets{0, 2, 2, 5, 6, 8}; + auto const l0_size = static_cast(l0_offsets).size() - 1; + std::vector l0_validity{false, true, false, false, true}; + auto l0_unsliced = cudf::make_lists_column( + l0_size, + l0_offsets.release(), + s0.release(), + 2, + cudf::test::detail::make_null_mask(l0_validity.begin(), l0_validity.end())); + auto l0 = cudf::split(*l0_unsliced, {2})[1]; + + // col1 + cudf::test::fixed_width_column_wrapper s1_0{ + 
{10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}, nulls_at({14})}; + cudf::test::strings_column_wrapper s1_1{"arg", + "mno", + "ampere", + "gpu", + "", + "hhh", + "warp", + "donuts", + "parking", + "", + "apply", + "twelve", + "mouse", + "bbb", + "pom"}; + std::vector> s1_children; + s1_children.push_back(s1_0.release()); + s1_children.push_back(s1_1.release()); + cudf::test::structs_column_wrapper s1(std::move(s1_children)); + cudf::test::fixed_width_column_wrapper l1_offsets{0, 0, 4, 7, 15, 15}; + auto const l1_size = static_cast(l1_offsets).size() - 1; + std::vector l1_validity{false, true, false, true, true}; + auto l1_unsliced = cudf::make_lists_column( + l1_size, + l1_offsets.release(), + s1.release(), + 1, + cudf::test::detail::make_null_mask(l1_validity.begin(), l1_validity.end())); + auto l1 = cudf::split(*l1_unsliced, {2})[1]; + + // concatenate_policy::IGNORE_NULLS + { + // perform the concatenate + cudf::table_view t({l0, l1}); + auto result = cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::IGNORE); + + // expected + cudf::test::fixed_width_column_wrapper se_0{{5, 17, 18, 19, 20, 21, 22, 23, 24, 6, 7}, + nulls_at({8})}; + cudf::test::strings_column_wrapper se_1{ + {"g", "donuts", "parking", "", "apply", "twelve", "mouse", "bbb", "pom", "xyw", "ijk"}, + nulls_at({})}; + std::vector> se_children; + se_children.push_back(se_0.release()); + se_children.push_back(se_1.release()); + cudf::test::structs_column_wrapper se(std::move(se_children)); + cudf::test::fixed_width_column_wrapper le_offsets{0, 0, 9, 11}; + auto const le_size = static_cast(le_offsets).size() - 1; + std::vector le_validity{false, true, true}; + auto expected = cudf::make_lists_column( + le_size, + le_offsets.release(), + se.release(), + 1, + cudf::test::detail::make_null_mask(le_validity.begin(), le_validity.end())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } + + // concatenate_policy::NULLIFY_OUTPUT_ROW + { + // perform the concatenate + 
cudf::table_view t({l0, l1}); + auto result = + cudf::lists::concatenate_rows(t, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + + // expected + cudf::test::fixed_width_column_wrapper se_0{{6, 7}, nulls_at({})}; + cudf::test::strings_column_wrapper se_1{{"xyw", "ijk"}, nulls_at({})}; + std::vector> se_children; + se_children.push_back(se_0.release()); + se_children.push_back(se_1.release()); + cudf::test::structs_column_wrapper se(std::move(se_children)); + cudf::test::fixed_width_column_wrapper le_offsets{0, 0, 0, 2}; + auto const le_size = static_cast(le_offsets).size() - 1; + std::vector le_validity{false, false, true}; + auto expected = cudf::make_lists_column( + le_size, + le_offsets.release(), + se.release(), + 2, + cudf::test::detail::make_null_mask(le_validity.begin(), le_validity.end())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 492560f7b7f..0c279a1e788 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2861,16 +2861,31 @@ void testListConcatByRow() { assertColumnsAreEqual(expect, result); } - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromInts(1, 2, 3); - ColumnVector result = ColumnVector.listConcatenateByRow(cv, cv)) { - } - }); - - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromLists(new HostColumnVector.ListType(true, + try (ColumnVector cv = ColumnVector.fromLists(new HostColumnVector.ListType(true, new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.INT32))), Arrays.asList(Arrays.asList(1))); + ColumnVector result = ColumnVector.listConcatenateByRow(cv, cv); + ColumnVector expect = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.ListType(true, + new 
HostColumnVector.BasicType(true, DType.INT32))), Arrays.asList(Arrays.asList(1), Arrays.asList(1)))){ + assertColumnsAreEqual(expect, result); + } + + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.INT32))), Arrays.asList(Arrays.asList(1, null, 2))); + ColumnVector cv2 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.INT32))), Arrays.asList(Arrays.asList(null, null, 5, 6, null))); + ColumnVector result = ColumnVector.listConcatenateByRow(cv1, cv2); + ColumnVector expect = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.INT32))), Arrays.asList(Arrays.asList(1, null, 2), Arrays.asList(null, null, 5, 6, null)))){ + assertColumnsAreEqual(expect, result); + } + + assertThrows(CudfException.class, () -> { + try (ColumnVector cv = ColumnVector.fromInts(1, 2, 3); ColumnVector result = ColumnVector.listConcatenateByRow(cv, cv)) { } });