From 41a7baa5b324976eabdda187631e55fc7f636f19 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 23 May 2022 14:54:32 -0700 Subject: [PATCH 01/51] Switch to use `generate_list_labels` --- cpp/src/lists/drop_list_duplicates.cu | 49 +++++---------------------- 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 8a4704ad13b..62372612f5d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -152,44 +153,6 @@ struct replace_negative_nans_dispatch { } }; -/** - * @brief Populate 1-based list indices for all list entries. - * - * Given a number of total list entries in a lists column and an array containing list offsets, - * generate an array that maps each list entry to a 1-based index of the list containing - * that entry. - * - * Instead of regular 0-based indices, we need to use 1-based indices for later post-processing. - * - * @code{.pseudo} - * num_lists = 3, num_entries = 10, offsets = { 0, 4, 6, 10 } - * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } - * @endcode - * - * @param num_lists The size of the input lists column. - * @param num_entries The number of entries in the lists column. - * @param offsets_begin The pointer refers to data of list offsets. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return An array containing 1-based list indices corresponding to each list entry. - */ -rmm::device_uvector generate_entry_list_indices(size_type num_lists, - size_type num_entries, - offset_type const* offsets_begin, - rmm::cuda_stream_view stream) -{ - auto entry_list_indices = rmm::device_uvector(num_entries, stream); - - auto const input = thrust::make_transform_iterator( - offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); - thrust::upper_bound(rmm::exec_policy(stream), - input, - input + num_lists, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - entry_list_indices.begin()); - return entry_list_indices; -} - /** * @brief Perform an equality comparison between two entries in a lists column, specialized from * `cudf::element_equality_comparator` to take into account both parameters `nulls_equal` and @@ -570,9 +533,13 @@ std::pair, std::unique_ptr> drop_list_duplicates // The child column containing list entries. auto const keys_child = keys.get_sliced_child(stream); - // Generate a mapping from list entries to their 1-based list indices for the keys column. - auto const entries_list_indices = - generate_entry_list_indices(keys.size(), keys_child.size(), keys.offsets_begin(), stream); + // Generate a mapping from list entries to their list indices for the keys column. + auto const entries_list_indices = [&] { + auto labels = rmm::device_uvector(keys_child.size(), stream); + cudf::lists::detail::generate_list_labels( + keys.offsets_begin(), keys.offsets_end(), labels.begin(), keys_child.size(), stream); + return labels; + }(); // Generate segmented sorted order for key entries. // The keys column will be sorted (gathered) using this order. From 1204e9a539edd072e8756cef5ede96a0b28c2ea8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 23 May 2022 15:00:27 -0700 Subject: [PATCH 02/51] Remove comments --- cpp/src/lists/drop_list_duplicates.cu | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 62372612f5d..ba7f6ec8a1e 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -451,23 +451,10 @@ std::unique_ptr generate_output_offsets(size_type num_lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Let consider an example: - // Given the original offsets of the input lists column is [0, 4, 5, 6, 7, 10, 11, 13]. - // The original entries_list_indices is [1, 1, 1, 1, 2, 3, 4, 5, 5, 5, 6, 7, 7], and after - // extracting unique entries we have the entries_list_indices becomes [1, 1, 1, 4, 5, 5, 5, 7, 7] - // and num_lists is 7. These are the input to this function. - // - // Through extracting unique list entries, one entry in the list index 1 has been removed (first - // list, as we are using 1-based list index), and entries in the lists with indices {3, 3, 6} have - // been removed completely. - - // This variable stores the (1-based) list indices of the unique entries but only one index value - // per non-empty list. Given the example above, we will have this array hold the values - // [1, 4, 5, 7]. + // Stores the unique list indices of unique entries (i.e., at max one list index per list). auto list_indices = rmm::device_uvector(num_lists, stream); // Stores the non-zero numbers of unique entries per list. - // Given the example above, we will have this array contains the values [3, 1, 3, 2] auto list_sizes = rmm::device_uvector(num_lists, stream); // Count the numbers of unique entries for each non-empty list. @@ -487,7 +474,6 @@ std::unique_ptr generate_output_offsets(size_type num_lists, rmm::exec_policy(stream), new_offsets.begin(), num_lists + 1, offset_type{0}); // Scatter non-zero sizes of the output lists into the correct positions. - // Given the example above, we will have new_offsets = [0, 3, 0, 0, 1, 3, 0, 2] thrust::scatter(rmm::exec_policy(stream), list_sizes.begin(), list_sizes.begin() + num_non_empty_lists, @@ -495,11 +481,9 @@ std::unique_ptr generate_output_offsets(size_type num_lists, new_offsets.begin()); // Generate offsets from sizes. - // Given the example above, we will have new_offsets = [0, 3, 3, 3, 4, 7, 7, 9] thrust::inclusive_scan( rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); - // Done. Hope that your head didn't explode after reading till this point. return std::make_unique( data_type{type_to_id()}, num_lists + 1, new_offsets.release()); } From 165f75244ac9bb0aa5af8d18f658ba4eb9957578 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 23 May 2022 16:12:09 -0700 Subject: [PATCH 03/51] Switch to use 0-based list labels --- cpp/src/lists/drop_list_duplicates.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index ba7f6ec8a1e..5ed48a8d36f 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -481,7 +481,7 @@ std::unique_ptr generate_output_offsets(size_type num_lists, new_offsets.begin()); // Generate offsets from sizes. - thrust::inclusive_scan( + thrust::exclusive_scan( rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); return std::make_unique( @@ -520,8 +520,8 @@ std::pair, std::unique_ptr> drop_list_duplicates // Generate a mapping from list entries to their list indices for the keys column. auto const entries_list_indices = [&] { auto labels = rmm::device_uvector(keys_child.size(), stream); - cudf::lists::detail::generate_list_labels( - keys.offsets_begin(), keys.offsets_end(), labels.begin(), keys_child.size(), stream); + cudf::lists::detail::fill_segmented_labels( + keys.offsets_begin(), keys.offsets_end(), labels.begin(), labels.end(), stream); return labels; }(); From 3f385eba720e7215bcd3c3cd8c92351abd104b16 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 23 May 2022 21:20:15 -0700 Subject: [PATCH 04/51] Implement `fill_segmented_labels` --- cpp/src/lists/labelling.cuh | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 cpp/src/lists/labelling.cuh diff --git a/cpp/src/lists/labelling.cuh b/cpp/src/lists/labelling.cuh new file mode 100644 index 00000000000..039f1076e79 --- /dev/null +++ b/cpp/src/lists/labelling.cuh @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::lists::detail { + +/** + * @brief Fill label values for segments defined by a given offsets array. + * + * Given a pair of iterators accessing to an offset array, generate label values for segments + * defined by the offset values. The output will be an array containing consecutive groups of + * identical labels, the number of elements in each group `i` is defined by + * `offsets[i+1] - offsets[i]`. + * + * The labels always start from `0` regardless of the offset values. + * + * @code{.pseudo} + * Examples: + * + * offsets = { 0, 4, 6, 10 } + * output = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 } + * + * offsets = { 5, 10, 12 } + * output = { 0, 0, 0, 0, 0, 1, 1 } + * @endcode + * + * @param offsets_begin The beginning of the offsets that define segments. + * @param offsets_end The end of the offsets that define segments. + * @param out_begin The beginning of the output label range. + * @param out_end The end of the output label range. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +template +void fill_segmented_labels(InputIterator offsets_begin, + InputIterator offsets_end, + OutputIterator out_begin, + OutputIterator out_end, + rmm::cuda_stream_view stream) +{ + // Make the offset values starting from `0`. + auto const input = thrust::make_transform_iterator( + offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); + + using OutputType = typename thrust::iterator_value::type; + auto const output = thrust::make_transform_output_iterator( + out_begin, [] __device__(auto const idx) { return idx - OutputType{1}; }); + + thrust::upper_bound(rmm::exec_policy(stream), + input, + input + thrust::distance(offsets_begin, offsets_end), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator( + static_cast(thrust::distance(out_begin, out_end))), + output); +} + +} // namespace cudf::lists::detail From 26846258828cc22667390ac5cb0d32659811be6e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 23 May 2022 21:51:16 -0700 Subject: [PATCH 05/51] Move file and change file name --- cpp/{src/lists/labelling.cuh => include/cudf/detail/fill.cuh} | 4 ++-- cpp/src/lists/drop_list_duplicates.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename cpp/{src/lists/labelling.cuh => include/cudf/detail/fill.cuh} (97%) diff --git a/cpp/src/lists/labelling.cuh b/cpp/include/cudf/detail/fill.cuh similarity index 97% rename from cpp/src/lists/labelling.cuh rename to cpp/include/cudf/detail/fill.cuh index 039f1076e79..6c00af5305a 100644 --- a/cpp/src/lists/labelling.cuh +++ b/cpp/include/cudf/detail/fill.cuh @@ -25,7 +25,7 @@ #include #include -namespace cudf::lists::detail { +namespace cudf::detail { /** * @brief Fill label values for segments defined by a given offsets array. @@ -77,4 +77,4 @@ void fill_segmented_labels(InputIterator offsets_begin, output); } -} // namespace cudf::lists::detail +} // namespace cudf::detail diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 5ed48a8d36f..8c123953b86 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include #include #include #include +#include #include #include #include @@ -520,7 +520,7 @@ std::pair, std::unique_ptr> drop_list_duplicates // Generate a mapping from list entries to their list indices for the keys column. auto const entries_list_indices = [&] { auto labels = rmm::device_uvector(keys_child.size(), stream); - cudf::lists::detail::fill_segmented_labels( + cudf::detail::fill_segmented_labels( keys.offsets_begin(), keys.offsets_end(), labels.begin(), labels.end(), stream); return labels; }(); From 0cfe856b277f16dd8f12a1bc6175a6c230793b07 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 23 May 2022 21:51:44 -0700 Subject: [PATCH 06/51] Use `fill_segmented_labels` in groupby --- cpp/src/groupby/sort/sort_helper.cu | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 10201782854..cf14573a64c 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -226,19 +227,11 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( if (num_keys(stream) == 0) return group_labels; - thrust::uninitialized_fill(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - index_vector::value_type{0}); - thrust::scatter(rmm::exec_policy(stream), - thrust::make_constant_iterator(1, decltype(num_groups(stream))(1)), - thrust::make_constant_iterator(1, num_groups(stream)), - group_offsets(stream).begin() + 1, - group_labels.begin()); - - thrust::inclusive_scan( - rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), group_labels.begin()); - + cudf::detail::fill_segmented_labels(group_offsets(stream).begin(), + group_offsets(stream).end(), + group_labels.begin(), + group_labels.end(), + stream); return group_labels; } From 100717615952af4b89e488a796c5698a59a2ff2d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 23 May 2022 22:04:18 -0700 Subject: [PATCH 07/51] Add comment --- cpp/include/cudf/detail/fill.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/cudf/detail/fill.cuh b/cpp/include/cudf/detail/fill.cuh index 6c00af5305a..dd00fb61687 100644 --- a/cpp/include/cudf/detail/fill.cuh +++ b/cpp/include/cudf/detail/fill.cuh @@ -64,6 +64,8 @@ void fill_segmented_labels(InputIterator offsets_begin, auto const input = thrust::make_transform_iterator( offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); + // The output labels from `upper_bound` will start from `1`. + // This will shift the result values back to start from `0`. using OutputType = typename thrust::iterator_value::type; auto const output = thrust::make_transform_output_iterator( out_begin, [] __device__(auto const idx) { return idx - OutputType{1}; }); From 9b5a88d4c64f07743dfbff29ae560cfa94820b76 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 May 2022 12:52:31 -0700 Subject: [PATCH 08/51] Add example --- cpp/src/lists/drop_list_duplicates.cu | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 8c123953b86..1ab3a84ad18 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -451,10 +451,18 @@ std::unique_ptr generate_output_offsets(size_type num_lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Stores the unique list indices of unique entries (i.e., at max one list index per list). + // Let consider an example: + // Given the input lists column with offsets are [0, 4, 7, 7, 10], num_lists is 4, and + // entries_list_indices is [0, 0, 0, 0, 1, 1, 1, 3, 3, 3]. + // After extracting unique entries we have the entries_list_indices becomes + // [0, 0, 1, 3, 3]. These are the input to this function. + + // This stores the unique list indices of unique entries (i.e., at max one list index per list). + // Given the example above, we will have this array hold the values [0, 1, 3]. auto list_indices = rmm::device_uvector(num_lists, stream); // Stores the non-zero numbers of unique entries per list. + // Given the example above, we will have this array contains the values [2, 1, 2] auto list_sizes = rmm::device_uvector(num_lists, stream); // Count the numbers of unique entries for each non-empty list. @@ -474,6 +482,8 @@ std::unique_ptr generate_output_offsets(size_type num_lists, rmm::exec_policy(stream), new_offsets.begin(), num_lists + 1, offset_type{0}); // Scatter non-zero sizes of the output lists into the correct positions. + // Given the example above, we scatter [2, 1, 2] by the scatter_map [0, 1, 3] and will have + // new_offsets = [2, 1, 0, 2, 0] thrust::scatter(rmm::exec_policy(stream), list_sizes.begin(), list_sizes.begin() + num_non_empty_lists, @@ -481,6 +491,7 @@ std::unique_ptr generate_output_offsets(size_type num_lists, new_offsets.begin()); // Generate offsets from sizes. + // Given the example above, we will have new_offsets = [0, 2, 3, 3, 5] thrust::exclusive_scan( rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); From 9903007d7d32b8d299f944bed7e2036175f93ad9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 May 2022 12:56:54 -0700 Subject: [PATCH 09/51] Rename and move file --- .../{fill.cuh => labeling/label_segments.cuh} | 10 +++++----- cpp/src/groupby/sort/sort_helper.cu | 13 ++++++------- cpp/src/lists/drop_list_duplicates.cu | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) rename cpp/include/cudf/detail/{fill.cuh => labeling/label_segments.cuh} (91%) diff --git a/cpp/include/cudf/detail/fill.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh similarity index 91% rename from cpp/include/cudf/detail/fill.cuh rename to cpp/include/cudf/detail/labeling/label_segments.cuh index dd00fb61687..248f00404fc 100644 --- a/cpp/include/cudf/detail/fill.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -54,11 +54,11 @@ namespace cudf::detail { * @param stream CUDA stream used for device memory operations and kernel launches. */ template -void fill_segmented_labels(InputIterator offsets_begin, - InputIterator offsets_end, - OutputIterator out_begin, - OutputIterator out_end, - rmm::cuda_stream_view stream) +void label_segments(InputIterator offsets_begin, + InputIterator offsets_end, + OutputIterator out_begin, + OutputIterator out_end, + rmm::cuda_stream_view stream) { // Make the offset values starting from `0`. auto const input = thrust::make_transform_iterator( diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index cf14573a64c..bf3171f1da8 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -17,11 +17,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -224,14 +224,13 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( _group_labels = std::make_unique(num_keys(stream), stream); auto& group_labels = *_group_labels; - if (num_keys(stream) == 0) return group_labels; - cudf::detail::fill_segmented_labels(group_offsets(stream).begin(), - group_offsets(stream).end(), - group_labels.begin(), - group_labels.end(), - stream); + cudf::detail::label_segments(group_offsets(stream).begin(), + group_offsets(stream).end(), + group_labels.begin(), + group_labels.end(), + stream); return group_labels; } diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 1ab3a84ad18..49e0e944fb8 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -531,7 +531,7 @@ std::pair, std::unique_ptr> drop_list_duplicates // Generate a mapping from list entries to their list indices for the keys column. auto const entries_list_indices = [&] { auto labels = rmm::device_uvector(keys_child.size(), stream); - cudf::detail::fill_segmented_labels( + cudf::detail::label_segments( keys.offsets_begin(), keys.offsets_end(), labels.begin(), labels.end(), stream); return labels; }(); From 7bd714d31c79f2302a7c8eb3ad09a2c6e7141126 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 24 May 2022 12:59:15 -0700 Subject: [PATCH 10/51] Rename variable --- cpp/include/cudf/detail/labeling/label_segments.cuh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 248f00404fc..20ad6b565ba 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -60,8 +60,7 @@ void label_segments(InputIterator offsets_begin, OutputIterator out_end, rmm::cuda_stream_view stream) { - // Make the offset values starting from `0`. - auto const input = thrust::make_transform_iterator( + auto const zero_normalized_offsets = thrust::make_transform_iterator( offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); // The output labels from `upper_bound` will start from `1`. @@ -71,8 +70,8 @@ void label_segments(InputIterator offsets_begin, out_begin, [] __device__(auto const idx) { return idx - OutputType{1}; }); thrust::upper_bound(rmm::exec_policy(stream), - input, - input + thrust::distance(offsets_begin, offsets_end), + zero_normalized_offsets, + zero_normalized_offsets + thrust::distance(offsets_begin, offsets_end), thrust::make_counting_iterator(0), thrust::make_counting_iterator( static_cast(thrust::distance(out_begin, out_end))), From 031302ba1f695e7682b40579e5465a4bf3d6a6b6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 May 2022 15:13:37 -0700 Subject: [PATCH 11/51] Add a benchmark --- cpp/benchmarks/groupby/group_struct.cu | 167 +++++++++++++++++-------- 1 file changed, 114 insertions(+), 53 deletions(-) diff --git a/cpp/benchmarks/groupby/group_struct.cu b/cpp/benchmarks/groupby/group_struct.cu index c5eceda2df2..6c640c64050 100644 --- a/cpp/benchmarks/groupby/group_struct.cu +++ b/cpp/benchmarks/groupby/group_struct.cu @@ -18,78 +18,139 @@ #include #include -#include -#include -#include -#include +#include -static constexpr cudf::size_type num_struct_members = 8; -static constexpr cudf::size_type max_int = 100; -static constexpr cudf::size_type max_str_length = 32; +#include +#include +#include +#include +#include +#include +#include +#include +#include -static auto create_data_table(cudf::size_type n_rows) +#include +#include + +//================================================================================================== +auto create_offsets(cudf::size_type n_groups, rmm::cuda_stream_view stream) { - data_profile table_profile; + // This is the maximum size of each group. + constexpr cudf::size_type max_int = 1000; + + auto table_profile = data_profile{}; table_profile.set_distribution_params(cudf::type_id::INT32, distribution_id::UNIFORM, 0, max_int); - table_profile.set_distribution_params( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - - // The first two struct members are int32 and string. - // The first column is also used as keys in groupby. - // The subsequent struct members are int32 and string again. - return create_random_table( - cycle_dtypes({cudf::type_id::INT32, cudf::type_id::STRING}, num_struct_members), - row_count{n_rows}, - table_profile); + auto sizes = + std::move(create_random_table({cudf::type_id::INT32}, row_count{n_groups}, table_profile) + ->release() + .front()); + auto const sizes_view = sizes->mutable_view(); + + thrust::exclusive_scan(rmm::exec_policy(), + sizes_view.template begin(), + sizes_view.template end(), + sizes_view.template begin()); + + cudf::size_type n_elements; + CUDF_CUDA_TRY(cudaMemcpyAsync(&n_elements, + sizes_view.template end() - 1, + sizeof(cudf::size_type), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + + return std::pair{std::move(sizes), n_elements}; +} + +//================================================================================================== +template +void old_way(InputIterator offsets_begin, + InputIterator offsets_end, + OutputIterator out_begin, + OutputIterator out_end, + rmm::cuda_stream_view stream) +{ + thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, cudf::size_type{0}); + thrust::scatter( + rmm::exec_policy(stream), + thrust::make_constant_iterator(1, 1), + thrust::make_constant_iterator( + 1, static_cast(thrust::distance(offsets_begin, offsets_end)) - 1), + offsets_begin + 1, + out_begin); + thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); } -// Max aggregation/scan technically has the same performance as min. -template -void BM_groupby_min_struct(benchmark::State& state) +//================================================================================================== +template +void new_way(InputIterator offsets_begin, + InputIterator offsets_end, + OutputIterator out_begin, + OutputIterator out_end, + rmm::cuda_stream_view stream) { - auto const n_rows = static_cast(state.range(0)); - auto data_cols = create_data_table(n_rows)->release(); + auto const zero_normalized_offsets = thrust::make_transform_iterator( + offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); + + // The output labels from `upper_bound` will start from `1`. + // This will shift the result values back to start from `0`. + using OutputType = typename thrust::iterator_value::type; + auto const output = thrust::make_transform_output_iterator( + out_begin, [] __device__(auto const idx) { return idx - OutputType{1}; }); - auto const keys_view = data_cols.front()->view(); - auto const values = - cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer()); + thrust::upper_bound(rmm::exec_policy(stream), + zero_normalized_offsets, + zero_normalized_offsets + thrust::distance(offsets_begin, offsets_end), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator( + static_cast(thrust::distance(out_begin, out_end))), + output); +} - using RequestType = std::conditional_t, - cudf::groupby::aggregation_request, - cudf::groupby::scan_request>; +//================================================================================================== +template +void BM_labeling(benchmark::State& state) +{ + auto const n_groups = static_cast(state.range(0)); + auto const stream = rmm::cuda_stream_default; - auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view})); - auto requests = std::vector(); - requests.emplace_back(RequestType()); - requests.front().values = values->view(); - requests.front().aggregations.push_back(cudf::make_min_aggregation()); + auto const [offsets, n_labels] = create_offsets(n_groups, stream); + auto const offsets_view = offsets->view(); + auto labels = rmm::device_uvector(n_labels, stream); for (auto _ : state) { [[maybe_unused]] auto const timer = cuda_event_timer(state, true); - if constexpr (std::is_same_v) { - [[maybe_unused]] auto const result = gb_obj.aggregate(requests); + if constexpr (use_old) { + old_way(offsets_view.template begin(), + offsets_view.template end(), + labels.begin(), + labels.end(), + stream); } else { - [[maybe_unused]] auto const result = gb_obj.scan(requests); + new_way(offsets_view.template begin(), + offsets_view.template end(), + labels.begin(), + labels.end(), + stream); } } } -class Groupby : public cudf::benchmark { +//================================================================================================== +class Labeling : public cudf::benchmark { }; -#define MIN_RANGE 10'000 -#define MAX_RANGE 10'000'000 - -#define REGISTER_BENCHMARK(name, op_type) \ - BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) \ - { \ - BM_groupby_min_struct(state); \ - } \ - BENCHMARK_REGISTER_F(Groupby, name) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond) \ - ->RangeMultiplier(4) \ +#define MIN_RANGE 1'000 +#define MAX_RANGE 4'200'000 + +#define REGISTER_BENCHMARK(name, use_old) \ + BENCHMARK_DEFINE_F(Labeling, name)(::benchmark::State & state) { BM_labeling(state); } \ + BENCHMARK_REGISTER_F(Labeling, name) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond) \ + ->RangeMultiplier(4) \ ->Ranges({{MIN_RANGE, MAX_RANGE}}); -REGISTER_BENCHMARK(Aggregation, cudf::groupby_aggregation) -REGISTER_BENCHMARK(Scan, cudf::groupby_scan_aggregation) +REGISTER_BENCHMARK(LabelingOldWay, true) +// REGISTER_BENCHMARK(LabelingNewWay, false) From 28e14634f94d79cfd83c56a0f29a5adfad443732 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 May 2022 20:04:34 -0700 Subject: [PATCH 12/51] Rewrite `label_segments` --- .../cudf/detail/labeling/label_segments.cuh | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 20ad6b565ba..bc50e14c0b7 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -19,11 +19,10 @@ #include #include -#include #include -#include -#include -#include +#include +#include +#include namespace cudf::detail { @@ -36,12 +35,13 @@ namespace cudf::detail { * `offsets[i+1] - offsets[i]`. * * The labels always start from `0` regardless of the offset values. + * In case there are empty segments, their corresponding label values will be skipped in the output. * * @code{.pseudo} * Examples: * - * offsets = { 0, 4, 6, 10 } - * output = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 } + * offsets = { 0, 4, 6, 6, 6, 10 } + * output = { 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 } * * offsets = { 5, 10, 12 } * output = { 0, 0, 0, 0, 0, 1, 1 } @@ -60,22 +60,25 @@ void label_segments(InputIterator offsets_begin, OutputIterator out_end, rmm::cuda_stream_view stream) { - auto const zero_normalized_offsets = thrust::make_transform_iterator( - offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); + auto const num_segments = + static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; + if (num_segments <= 0) { return; } - // The output labels from `upper_bound` will start from `1`. - // This will shift the result values back to start from `0`. - using OutputType = typename thrust::iterator_value::type; - auto const output = thrust::make_transform_output_iterator( - out_begin, [] __device__(auto const idx) { return idx - OutputType{1}; }); + using OutputType = typename thrust::iterator_value::type; + thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(size_type{1}), + thrust::make_counting_iterator(num_segments), + [offsets, out_begin] __device__(auto const idx) { + auto const dst_idx = offsets[idx] - offsets[0]; - thrust::upper_bound(rmm::exec_policy(stream), - zero_normalized_offsets, - zero_normalized_offsets + thrust::distance(offsets_begin, offsets_end), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - static_cast(thrust::distance(out_begin, out_end))), - output); + // Scatter value `1` to the index at offsets[idx]. + // In case we have repeated offsets (i.e., we have empty segments), this + // atomicAdd call will make sure the label values corresponding to these empty + // segments will be skipped in the output. + atomicAdd(&out_begin[dst_idx], OutputType{1}); + }); + thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); } } // namespace cudf::detail From d3708a5d5708046e16368ab8f0d2c5f80b46c33f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 May 2022 20:44:34 -0700 Subject: [PATCH 13/51] Fix compile error --- cpp/include/cudf/detail/labeling/label_segments.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index bc50e14c0b7..2758d7e82cd 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include namespace cudf::detail { @@ -69,14 +69,14 @@ void label_segments(InputIterator offsets_begin, thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(size_type{1}), thrust::make_counting_iterator(num_segments), - [offsets, out_begin] __device__(auto const idx) { + [offsets = offsets_begin, output = out_begin] __device__(auto const idx) { auto const dst_idx = offsets[idx] - offsets[0]; // Scatter value `1` to the index at offsets[idx]. // In case we have repeated offsets (i.e., we have empty segments), this // atomicAdd call will make sure the label values corresponding to these empty // segments will be skipped in the output. - atomicAdd(&out_begin[dst_idx], OutputType{1}); + atomicAdd(&output[dst_idx], OutputType{1}); }); thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); } From 60259c99a606e81584ec9649d032c1f5b845d932 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 May 2022 20:47:48 -0700 Subject: [PATCH 14/51] Hack to test --- cpp/benchmarks/CMakeLists.txt | 6 --- cpp/benchmarks/groupby/group_struct.cu | 60 ++++++++++++++++---------- cpp/tests/CMakeLists.txt | 9 ---- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 04dcf51dd40..1cfefec2713 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -195,13 +195,7 @@ ConfigureBench(FILL_BENCH filling/repeat.cpp) # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( GROUPBY_BENCH - groupby/group_sum.cu - groupby/group_nth.cu - groupby/group_shift.cu groupby/group_struct.cu - groupby/group_no_requests.cu - groupby/group_scan.cu - groupby/group_rank_benchmark.cu ) ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu) diff --git a/cpp/benchmarks/groupby/group_struct.cu b/cpp/benchmarks/groupby/group_struct.cu index 6c640c64050..2eb3b33955a 100644 --- a/cpp/benchmarks/groupby/group_struct.cu +++ b/cpp/benchmarks/groupby/group_struct.cu @@ -1,19 +1,3 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - #include #include #include @@ -21,6 +5,7 @@ #include #include +#include #include #include #include @@ -108,6 +93,35 @@ void new_way(InputIterator offsets_begin, output); } +//================================================================================================== +template +void new_way_v2(InputIterator offsets_begin, + InputIterator offsets_end, + OutputIterator out_begin, + OutputIterator out_end, + rmm::cuda_stream_view stream) +{ + auto const num_segments = + static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; + if (num_segments <= 0) { return; } + + using OutputType = typename thrust::iterator_value::type; + thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(cudf::size_type{1}), + thrust::make_counting_iterator(num_segments), + [offsets = offsets_begin, output = out_begin] __device__(auto const idx) { + auto const dst_idx = offsets[idx] - offsets[0]; + + // Scatter value `1` to the index at offsets[idx]. + // In case we have repeated offsets (i.e., we have empty segments), this + // atomicAdd call will make sure the label values corresponding to these empty + // segments will be skipped in the output. + atomicAdd(&output[dst_idx], OutputType{1}); + }); + thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); +} + //================================================================================================== template void BM_labeling(benchmark::State& state) @@ -128,11 +142,11 @@ void BM_labeling(benchmark::State& state) labels.end(), stream); } else { - new_way(offsets_view.template begin(), - offsets_view.template end(), - labels.begin(), - labels.end(), - stream); + new_way_v2(offsets_view.template begin(), + offsets_view.template end(), + labels.begin(), + labels.end(), + stream); } } } @@ -152,5 +166,5 @@ class Labeling : public cudf::benchmark { ->RangeMultiplier(4) \ ->Ranges({{MIN_RANGE, MAX_RANGE}}); -REGISTER_BENCHMARK(LabelingOldWay, true) -// REGISTER_BENCHMARK(LabelingNewWay, false) +// REGISTER_BENCHMARK(LabelingOldWay, true) +REGISTER_BENCHMARK(LabelingNewWay, false) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eadcd985de3..92751d725b3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -470,16 +470,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) # * lists tests ---------------------------------------------------------------------------------- ConfigureTest( LISTS_TEST - lists/apply_boolean_mask_test.cpp - lists/combine/concatenate_list_elements_tests.cpp - lists/combine/concatenate_rows_tests.cpp - lists/contains_tests.cpp - lists/count_elements_tests.cpp lists/drop_list_duplicates_tests.cpp - lists/explode_tests.cpp - lists/extract_tests.cpp - lists/sequences_tests.cpp - lists/sort_lists_tests.cpp ) # ################################################################################################## From 97b9a5736144cc3b3c091072d79daea9ce183e9a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 May 2022 20:48:18 -0700 Subject: [PATCH 15/51] Revert "Add a benchmark" This reverts commit 031302ba1f695e7682b40579e5465a4bf3d6a6b6. # Conflicts: # cpp/benchmarks/groupby/group_struct.cu --- cpp/benchmarks/groupby/group_struct.cu | 213 ++++++++----------------- 1 file changed, 69 insertions(+), 144 deletions(-) diff --git a/cpp/benchmarks/groupby/group_struct.cu b/cpp/benchmarks/groupby/group_struct.cu index 2eb3b33955a..c5eceda2df2 100644 --- a/cpp/benchmarks/groupby/group_struct.cu +++ b/cpp/benchmarks/groupby/group_struct.cu @@ -1,170 +1,95 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include #include #include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include -#include -#include +static constexpr cudf::size_type num_struct_members = 8; +static constexpr cudf::size_type max_int = 100; +static constexpr cudf::size_type max_str_length = 32; -//================================================================================================== -auto create_offsets(cudf::size_type n_groups, rmm::cuda_stream_view stream) +static auto create_data_table(cudf::size_type n_rows) { - // This is the maximum size of each group. - constexpr cudf::size_type max_int = 1000; - - auto table_profile = data_profile{}; + data_profile table_profile; table_profile.set_distribution_params(cudf::type_id::INT32, distribution_id::UNIFORM, 0, max_int); - auto sizes = - std::move(create_random_table({cudf::type_id::INT32}, row_count{n_groups}, table_profile) - ->release() - .front()); - auto const sizes_view = sizes->mutable_view(); - - thrust::exclusive_scan(rmm::exec_policy(), - sizes_view.template begin(), - sizes_view.template end(), - sizes_view.template begin()); - - cudf::size_type n_elements; - CUDF_CUDA_TRY(cudaMemcpyAsync(&n_elements, - sizes_view.template end() - 1, - sizeof(cudf::size_type), - cudaMemcpyDeviceToHost, - stream.value())); - stream.synchronize(); - - return std::pair{std::move(sizes), n_elements}; -} - -//================================================================================================== -template -void old_way(InputIterator offsets_begin, - InputIterator offsets_end, - OutputIterator out_begin, - OutputIterator out_end, - rmm::cuda_stream_view stream) -{ - thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, cudf::size_type{0}); - thrust::scatter( - rmm::exec_policy(stream), - thrust::make_constant_iterator(1, 1), - thrust::make_constant_iterator( - 1, static_cast(thrust::distance(offsets_begin, offsets_end)) - 1), - offsets_begin + 1, - out_begin); - thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + + // The first two struct members are int32 and string. + // The first column is also used as keys in groupby. + // The subsequent struct members are int32 and string again. + return create_random_table( + cycle_dtypes({cudf::type_id::INT32, cudf::type_id::STRING}, num_struct_members), + row_count{n_rows}, + table_profile); } -//================================================================================================== -template -void new_way(InputIterator offsets_begin, - InputIterator offsets_end, - OutputIterator out_begin, - OutputIterator out_end, - rmm::cuda_stream_view stream) +// Max aggregation/scan technically has the same performance as min. +template +void BM_groupby_min_struct(benchmark::State& state) { - auto const zero_normalized_offsets = thrust::make_transform_iterator( - offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); + auto const n_rows = static_cast(state.range(0)); + auto data_cols = create_data_table(n_rows)->release(); - // The output labels from `upper_bound` will start from `1`. - // This will shift the result values back to start from `0`. - using OutputType = typename thrust::iterator_value::type; - auto const output = thrust::make_transform_output_iterator( - out_begin, [] __device__(auto const idx) { return idx - OutputType{1}; }); + auto const keys_view = data_cols.front()->view(); + auto const values = + cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer()); - thrust::upper_bound(rmm::exec_policy(stream), - zero_normalized_offsets, - zero_normalized_offsets + thrust::distance(offsets_begin, offsets_end), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - static_cast(thrust::distance(out_begin, out_end))), - output); -} - -//================================================================================================== -template -void new_way_v2(InputIterator offsets_begin, - InputIterator offsets_end, - OutputIterator out_begin, - OutputIterator out_end, - rmm::cuda_stream_view stream) -{ - auto const num_segments = - static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; - if (num_segments <= 0) { return; } + using RequestType = std::conditional_t, + cudf::groupby::aggregation_request, + cudf::groupby::scan_request>; - using OutputType = typename thrust::iterator_value::type; - thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(cudf::size_type{1}), - thrust::make_counting_iterator(num_segments), - [offsets = offsets_begin, output = out_begin] __device__(auto const idx) { - auto const dst_idx = offsets[idx] - offsets[0]; - - // Scatter value `1` to the index at offsets[idx]. - // In case we have repeated offsets (i.e., we have empty segments), this - // atomicAdd call will make sure the label values corresponding to these empty - // segments will be skipped in the output. - atomicAdd(&output[dst_idx], OutputType{1}); - }); - thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); -} - -//================================================================================================== -template -void BM_labeling(benchmark::State& state) -{ - auto const n_groups = static_cast(state.range(0)); - auto const stream = rmm::cuda_stream_default; - - auto const [offsets, n_labels] = create_offsets(n_groups, stream); - auto const offsets_view = offsets->view(); - auto labels = rmm::device_uvector(n_labels, stream); + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view})); + auto requests = std::vector(); + requests.emplace_back(RequestType()); + requests.front().values = values->view(); + requests.front().aggregations.push_back(cudf::make_min_aggregation()); for (auto _ : state) { [[maybe_unused]] auto const timer = cuda_event_timer(state, true); - if constexpr (use_old) { - old_way(offsets_view.template begin(), - offsets_view.template end(), - labels.begin(), - labels.end(), - stream); + if constexpr (std::is_same_v) { + [[maybe_unused]] auto const result = gb_obj.aggregate(requests); } else { - new_way_v2(offsets_view.template begin(), - offsets_view.template end(), - labels.begin(), - labels.end(), - stream); + [[maybe_unused]] auto const result = gb_obj.scan(requests); } } } -//================================================================================================== -class Labeling : public cudf::benchmark { +class Groupby : public cudf::benchmark { }; -#define MIN_RANGE 1'000 -#define MAX_RANGE 4'200'000 - -#define REGISTER_BENCHMARK(name, use_old) \ - BENCHMARK_DEFINE_F(Labeling, name)(::benchmark::State & state) { BM_labeling(state); } \ - BENCHMARK_REGISTER_F(Labeling, name) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond) \ - ->RangeMultiplier(4) \ +#define MIN_RANGE 10'000 +#define MAX_RANGE 10'000'000 + +#define REGISTER_BENCHMARK(name, op_type) \ + BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) \ + { \ + BM_groupby_min_struct(state); \ + } \ + BENCHMARK_REGISTER_F(Groupby, name) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond) \ + ->RangeMultiplier(4) \ ->Ranges({{MIN_RANGE, MAX_RANGE}}); -// REGISTER_BENCHMARK(LabelingOldWay, true) -REGISTER_BENCHMARK(LabelingNewWay, false) +REGISTER_BENCHMARK(Aggregation, cudf::groupby_aggregation) +REGISTER_BENCHMARK(Scan, cudf::groupby_scan_aggregation) From a094dab6a2724e92ca50f8271c3ede41b1bc01a8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 25 May 2022 20:48:58 -0700 Subject: [PATCH 16/51] Revert "Hack to test" This reverts commit 60259c99a606e81584ec9649d032c1f5b845d932. # Conflicts: # cpp/benchmarks/groupby/group_struct.cu --- cpp/benchmarks/CMakeLists.txt | 6 ++++++ cpp/tests/CMakeLists.txt | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 1cfefec2713..04dcf51dd40 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -195,7 +195,13 @@ ConfigureBench(FILL_BENCH filling/repeat.cpp) # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( GROUPBY_BENCH + groupby/group_sum.cu + groupby/group_nth.cu + groupby/group_shift.cu groupby/group_struct.cu + groupby/group_no_requests.cu + groupby/group_scan.cu + groupby/group_rank_benchmark.cu ) ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 92751d725b3..eadcd985de3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -470,7 +470,16 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) # * lists tests ---------------------------------------------------------------------------------- ConfigureTest( LISTS_TEST + lists/apply_boolean_mask_test.cpp + lists/combine/concatenate_list_elements_tests.cpp + lists/combine/concatenate_rows_tests.cpp + lists/contains_tests.cpp + lists/count_elements_tests.cpp lists/drop_list_duplicates_tests.cpp + lists/explode_tests.cpp + lists/extract_tests.cpp + lists/sequences_tests.cpp + lists/sort_lists_tests.cpp ) # ################################################################################################## From f5a552009dac707e041ee622710dfa7201bb6d6b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 08:36:03 -0700 Subject: [PATCH 17/51] Add comment --- cpp/include/cudf/detail/labeling/label_segments.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 2758d7e82cd..1005bfd4033 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -70,6 +70,7 @@ void label_segments(InputIterator offsets_begin, thrust::make_counting_iterator(size_type{1}), thrust::make_counting_iterator(num_segments), [offsets = offsets_begin, output = out_begin] __device__(auto const idx) { + // Zero-normalized offsets. auto const dst_idx = offsets[idx] - offsets[0]; // Scatter value `1` to the index at offsets[idx]. From a060e3d2a8d883b1e610c094567325ac1915e8ce Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 10:36:46 -0700 Subject: [PATCH 18/51] Add comment clarifying bound check --- cpp/include/cudf/detail/labeling/label_segments.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 1005bfd4033..42794cbc7e9 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -37,6 +37,10 @@ namespace cudf::detail { * The labels always start from `0` regardless of the offset values. * In case there are empty segments, their corresponding label values will be skipped in the output. * + * Note that the caller is responsible to make sure the output range have the correct size, which is + * the total segment sizes (i.e., `size = *(offsets_end - 1) - *offsets_begin`). Otherwise, the + * result is undefined. + * * @code{.pseudo} * Examples: * From 6ac3f49f8814f4b458a4e635ffd701c10b2c1325 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 11:27:11 -0700 Subject: [PATCH 19/51] Implement `labels_to_offsets` --- .../cudf/detail/labeling/label_segments.cuh | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 1005bfd4033..e5ee4404f16 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -17,10 +17,13 @@ #include #include +#include #include #include +#include #include +#include #include #include @@ -82,4 +85,84 @@ void label_segments(InputIterator offsets_begin, thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); } +/** + * @brief Generate segment offsets from groups of identical label values. + * + * Given a pair of iterators accessing to an array containing groups of identical label values, + * generate offsets for segments defined by these label. + * + * Empty segments are also taken into account. If the input label values are discontinuous, the + * segments corresponding to the missing labels will be inferred as empty segments and their offsets + * will also be generated. + * + * Note that the caller is responsible to make sure the output range for offsets have the correct + * size, which is the maximum label value plus two (i.e., `size = *(labels_end - 1) + 2`). + * Otherwise, the result is undefined. + * + * @code{.pseudo} + * Examples: + * + * labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ] + * offsets = [ 0, 4, 6, 6, 6, 10 ] + * + * labels = [ 0, 0, 0, 0, 0, 1, 1 ] + * offsets = [ 0, 5, 7 ] + * @endcode + * + * @param labels_begin The beginning of the labels that define segments. + * @param labels_end The end of the labels that define segments. + * @param out_begin The beginning of the output offset range. + * @param out_end The end of the output offset range. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +template +void labels_to_offsets(InputIterator labels_begin, + InputIterator labels_end, + OutputIterator out_begin, + OutputIterator out_end, + rmm::cuda_stream_view stream) +{ + auto const num_labels = static_cast(thrust::distance(labels_begin, labels_end)); + if (num_labels == 0) { return; } + + auto const num_segments = static_cast(thrust::distance(out_begin, out_end)) - 1; + + //========================================================================= + // Let consider an example: input_labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. + + // This stores the unique label values. + // Given the example above, we will have this array containing [0, 1, 4]. + auto list_indices = rmm::device_uvector(num_segments, stream); + + // Stores the non-zero segment sizes. + // Given the example above, we will have this array containing [4, 2, 4] + auto list_sizes = rmm::device_uvector(num_segments, stream); + + // Count the numbers of unique labels in the input. + auto const end = thrust::reduce_by_key(rmm::exec_policy(stream), + labels_begin, // keys + labels_end, // keys + thrust::make_constant_iterator(1), + list_indices.begin(), // output unique keys + list_sizes.begin()); // count for each key + auto const num_non_empty_segments = thrust::distance(list_indices.begin(), end.first); + + // The output offsets need to be filled with `0` value first. + using OutputType = typename thrust::iterator_value::type; + thrust::uninitialized_fill_n(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); + + // Scatter segment sizes into the end position of their corresponding segment indices. + // Given the example above, we scatter [4, 2, 4] by the scatter_map [0, 1, 4], resulting + // output = [4, 2, 0, 0, 4, 0]. + thrust::scatter(rmm::exec_policy(stream), + list_sizes.begin(), + list_sizes.begin() + num_non_empty_segments, + list_indices.begin(), + out_begin); + + // Generate offsets from sizes. + // Given the example above, the final output is [0, 4, 6, 6, 6, 10]. + thrust::exclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); +} + } // namespace cudf::detail From b8cb36374e2458a17c6314e3a78a383695c8d877 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 11:27:35 -0700 Subject: [PATCH 20/51] Rewrite example --- cpp/include/cudf/detail/labeling/label_segments.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 42794cbc7e9..d4c4764e66c 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -44,11 +44,11 @@ namespace cudf::detail { * @code{.pseudo} * Examples: * - * offsets = { 0, 4, 6, 6, 6, 10 } - * output = { 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 } + * offsets = [ 0, 4, 6, 6, 6, 10 ] + * output = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ] * - * offsets = { 5, 10, 12 } - * output = { 0, 0, 0, 0, 0, 1, 1 } + * offsets = [ 5, 10, 12 ] + * output = [ 0, 0, 0, 0, 0, 1, 1 ] * @endcode * * @param offsets_begin The beginning of the offsets that define segments. From 9bdfa6cb14f61205ba986e5d2af53c4d9ae012b1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 12:31:27 -0700 Subject: [PATCH 21/51] Fix initialization --- cpp/include/cudf/detail/labeling/label_segments.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 627ce2a11d8..f59ff8dcbc0 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -126,6 +126,10 @@ void labels_to_offsets(InputIterator labels_begin, OutputIterator out_end, rmm::cuda_stream_view stream) { + // The output offsets need to be filled with `0` value first. + using OutputType = typename thrust::iterator_value::type; + thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); + auto const num_labels = static_cast(thrust::distance(labels_begin, labels_end)); if (num_labels == 0) { return; } @@ -151,10 +155,6 @@ void labels_to_offsets(InputIterator labels_begin, list_sizes.begin()); // count for each key auto const num_non_empty_segments = thrust::distance(list_indices.begin(), end.first); - // The output offsets need to be filled with `0` value first. - using OutputType = typename thrust::iterator_value::type; - thrust::uninitialized_fill_n(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); - // Scatter segment sizes into the end position of their corresponding segment indices. // Given the example above, we scatter [4, 2, 4] by the scatter_map [0, 1, 4], resulting // output = [4, 2, 0, 0, 4, 0]. From baa303fc642c728cfccb91216b3174fee4ee3909 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 12:33:09 -0700 Subject: [PATCH 22/51] Use `label_to_offsets` --- cpp/src/lists/drop_list_duplicates.cu | 89 ++++++--------------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 49e0e944fb8..70ae31c73b9 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -343,11 +343,11 @@ struct get_indices_of_unique_entries_dispatch { { auto const d_view = column_device_view::create(all_lists_entries, stream); auto const comp = column_row_comparator_fn{list_indices, - *d_view, - *d_view, - nulls_equal, - all_lists_entries.has_nulls(), - nans_equal == nan_equality::ALL_EQUAL}; + *d_view, + *d_view, + nulls_equal, + all_lists_entries.has_nulls(), + nans_equal == nan_equality::ALL_EQUAL}; return cudf::detail::unique_copy(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), output_begin, @@ -438,67 +438,6 @@ std::vector> get_unique_entries_and_list_indices( ->release(); } -/** - * @brief Generate list offsets from entry list indices for the final result lists column(s). - * - * @param num_lists The number of lists. - * @param entries_list_indices The mapping from list entries to their (1-based) list indices. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device resource used to allocate memory. - */ -std::unique_ptr generate_output_offsets(size_type num_lists, - column_view const& entries_list_indices, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Let consider an example: - // Given the input lists column with offsets are [0, 4, 7, 7, 10], num_lists is 4, and - // entries_list_indices is [0, 0, 0, 0, 1, 1, 1, 3, 3, 3]. - // After extracting unique entries we have the entries_list_indices becomes - // [0, 0, 1, 3, 3]. These are the input to this function. - - // This stores the unique list indices of unique entries (i.e., at max one list index per list). - // Given the example above, we will have this array hold the values [0, 1, 3]. - auto list_indices = rmm::device_uvector(num_lists, stream); - - // Stores the non-zero numbers of unique entries per list. - // Given the example above, we will have this array contains the values [2, 1, 2] - auto list_sizes = rmm::device_uvector(num_lists, stream); - - // Count the numbers of unique entries for each non-empty list. - auto const end = thrust::reduce_by_key(rmm::exec_policy(stream), - entries_list_indices.template begin(), - entries_list_indices.template end(), - thrust::make_constant_iterator(1), - list_indices.begin(), - list_sizes.begin()); - auto const num_non_empty_lists = thrust::distance(list_indices.begin(), end.first); - - // The output offsets for the output lists column(s). - auto new_offsets = rmm::device_uvector(num_lists + 1, stream, mr); - - // The new offsets need to be filled with 0 value first. - thrust::uninitialized_fill_n( - rmm::exec_policy(stream), new_offsets.begin(), num_lists + 1, offset_type{0}); - - // Scatter non-zero sizes of the output lists into the correct positions. - // Given the example above, we scatter [2, 1, 2] by the scatter_map [0, 1, 3] and will have - // new_offsets = [2, 1, 0, 2, 0] - thrust::scatter(rmm::exec_policy(stream), - list_sizes.begin(), - list_sizes.begin() + num_non_empty_lists, - list_indices.begin(), - new_offsets.begin()); - - // Generate offsets from sizes. - // Given the example above, we will have new_offsets = [0, 2, 3, 3, 5] - thrust::exclusive_scan( - rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); - - return std::make_unique( - data_type{type_to_id()}, num_lists + 1, new_offsets.release()); -} - /** * @brief Common execution code called by all public `drop_list_duplicates` APIs. */ @@ -591,11 +530,19 @@ std::pair, std::unique_ptr> drop_list_duplicates mr); // Generate offsets for the output lists column(s). - auto output_offsets = generate_output_offsets( - keys.size(), - unique_entries_and_list_indices.back()->view(), // unique entries' list indices - stream, - mr); + auto output_offsets = [&] { + auto out_offsets = make_numeric_column( + data_type{type_to_id()}, keys.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto const offsets = out_offsets->mutable_view(); + auto const labels = + unique_entries_and_list_indices.back()->view(); // unique entries' list indices + cudf::detail::labels_to_offsets(labels.template begin(), + labels.template end(), + offsets.template begin(), + offsets.template end(), + stream); + return out_offsets; + }(); // If the values lists column is not given, its corresponding output will be nullptr. auto out_values = From 039b92fbe930fd28bbb93049ddfbb6e7d5af15b3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 12:37:45 -0700 Subject: [PATCH 23/51] Change variable name --- cpp/include/cudf/detail/labeling/label_segments.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index f59ff8dcbc0..646305e648a 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -106,11 +106,11 @@ void label_segments(InputIterator offsets_begin, * @code{.pseudo} * Examples: * - * labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ] - * offsets = [ 0, 4, 6, 6, 6, 10 ] + * labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ] + * output = [ 0, 4, 6, 6, 6, 10 ] * - * labels = [ 0, 0, 0, 0, 0, 1, 1 ] - * offsets = [ 0, 5, 7 ] + * labels = [ 0, 0, 0, 0, 0, 1, 1 ] + * output = [ 0, 5, 7 ] * @endcode * * @param labels_begin The beginning of the labels that define segments. From ab1e25ade59d67ff2dddce72e803ed7cc12562af Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 12:43:02 -0700 Subject: [PATCH 24/51] Reverse comments. They will be removed completely later on so don't change them now. --- cpp/src/lists/drop_list_duplicates.cu | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 49e0e944fb8..64d2eec2a9f 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -452,17 +452,22 @@ std::unique_ptr generate_output_offsets(size_type num_lists, rmm::mr::device_memory_resource* mr) { // Let consider an example: - // Given the input lists column with offsets are [0, 4, 7, 7, 10], num_lists is 4, and - // entries_list_indices is [0, 0, 0, 0, 1, 1, 1, 3, 3, 3]. - // After extracting unique entries we have the entries_list_indices becomes - // [0, 0, 1, 3, 3]. These are the input to this function. - - // This stores the unique list indices of unique entries (i.e., at max one list index per list). - // Given the example above, we will have this array hold the values [0, 1, 3]. + // Given the original offsets of the input lists column is [0, 4, 5, 6, 7, 10, 11, 13]. + // The original entries_list_indices is [1, 1, 1, 1, 2, 3, 4, 5, 5, 5, 6, 7, 7], and after + // extracting unique entries we have the entries_list_indices becomes [1, 1, 1, 4, 5, 5, 5, 7, 7] + // and num_lists is 7. These are the input to this function. + // + // Through extracting unique list entries, one entry in the list index 1 has been removed (first + // list, as we are using 1-based list index), and entries in the lists with indices {3, 3, 6} have + // been removed completely. + + // This variable stores the (1-based) list indices of the unique entries but only one index value + // per non-empty list. Given the example above, we will have this array hold the values + // [1, 4, 5, 7]. auto list_indices = rmm::device_uvector(num_lists, stream); // Stores the non-zero numbers of unique entries per list. - // Given the example above, we will have this array contains the values [2, 1, 2] + // Given the example above, we will have this array contains the values [3, 1, 3, 2] auto list_sizes = rmm::device_uvector(num_lists, stream); // Count the numbers of unique entries for each non-empty list. @@ -482,8 +487,7 @@ std::unique_ptr generate_output_offsets(size_type num_lists, rmm::exec_policy(stream), new_offsets.begin(), num_lists + 1, offset_type{0}); // Scatter non-zero sizes of the output lists into the correct positions. - // Given the example above, we scatter [2, 1, 2] by the scatter_map [0, 1, 3] and will have - // new_offsets = [2, 1, 0, 2, 0] + // Given the example above, we will have new_offsets = [0, 3, 0, 0, 1, 3, 0, 2] thrust::scatter(rmm::exec_policy(stream), list_sizes.begin(), list_sizes.begin() + num_non_empty_lists, @@ -491,10 +495,11 @@ std::unique_ptr generate_output_offsets(size_type num_lists, new_offsets.begin()); // Generate offsets from sizes. - // Given the example above, we will have new_offsets = [0, 2, 3, 3, 5] + // Given the example above, we will have new_offsets = [0, 3, 3, 3, 4, 7, 7, 9] thrust::exclusive_scan( rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); + // Done. Hope that your head didn't explode after reading till this point. return std::make_unique( data_type{type_to_id()}, num_lists + 1, new_offsets.release()); } From f9e081960008f1da37e4a1a15fe38f420f3b4340 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 13:04:51 -0700 Subject: [PATCH 25/51] Remove unused headers --- cpp/src/lists/drop_list_duplicates.cu | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 70ae31c73b9..32b84845ed1 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -40,14 +40,7 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include -#include #include @@ -343,11 +336,11 @@ struct get_indices_of_unique_entries_dispatch { { auto const d_view = column_device_view::create(all_lists_entries, stream); auto const comp = column_row_comparator_fn{list_indices, - *d_view, - *d_view, - nulls_equal, - all_lists_entries.has_nulls(), - nans_equal == nan_equality::ALL_EQUAL}; + *d_view, + *d_view, + nulls_equal, + all_lists_entries.has_nulls(), + nans_equal == nan_equality::ALL_EQUAL}; return cudf::detail::unique_copy(thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_entries), output_begin, From 52f6c3003dbfca63fb8b79c70f026a64e6aef0f8 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 26 May 2022 15:08:22 -0700 Subject: [PATCH 26/51] Rewrite example --- cpp/include/cudf/detail/labeling/label_segments.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 646305e648a..5ba8521c1fa 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -136,7 +136,7 @@ void labels_to_offsets(InputIterator labels_begin, auto const num_segments = static_cast(thrust::distance(out_begin, out_end)) - 1; //========================================================================= - // Let consider an example: input_labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. + // Let consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. // This stores the unique label values. // Given the example above, we will have this array containing [0, 1, 4]. From 79e2f6fa612b9c2804e07b5193a4528de65ca501 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 14:07:32 -0700 Subject: [PATCH 27/51] Rewrite comments --- cpp/include/cudf/detail/labeling/label_segments.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 5ba8521c1fa..bd9f18eb95e 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -135,7 +135,7 @@ void labels_to_offsets(InputIterator labels_begin, auto const num_segments = static_cast(thrust::distance(out_begin, out_end)) - 1; - //========================================================================= + //================================================================================ // Let consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. // This stores the unique label values. @@ -143,7 +143,7 @@ void labels_to_offsets(InputIterator labels_begin, auto list_indices = rmm::device_uvector(num_segments, stream); // Stores the non-zero segment sizes. - // Given the example above, we will have this array containing [4, 2, 4] + // Given the example above, we will have this array containing [4, 2, 4]. auto list_sizes = rmm::device_uvector(num_segments, stream); // Count the numbers of unique labels in the input. @@ -151,8 +151,8 @@ void labels_to_offsets(InputIterator labels_begin, labels_begin, // keys labels_end, // keys thrust::make_constant_iterator(1), - list_indices.begin(), // output unique keys - list_sizes.begin()); // count for each key + list_indices.begin(), // output unique input labels + list_sizes.begin()); // count for each label auto const num_non_empty_segments = thrust::distance(list_indices.begin(), end.first); // Scatter segment sizes into the end position of their corresponding segment indices. From 6eec56ac39f2db87cc35f6cf07b3b97d22b05d02 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 14:40:52 -0700 Subject: [PATCH 28/51] Cleanup headers --- cpp/src/lists/drop_list_duplicates.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 32b84845ed1..c04cc71c7db 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -36,9 +36,9 @@ #include #include -#include #include #include +#include #include #include From ba91075ee165b70224822e904ad941c2c760498d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 14:46:24 -0700 Subject: [PATCH 29/51] Cleanup headers Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 1 + cpp/src/lists/drop_list_duplicates.cu | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index d4c4764e66c..77e9278ed59 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 64d2eec2a9f..929af866b6d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -36,17 +36,15 @@ #include #include -#include #include #include +#include #include #include #include -#include #include #include #include -#include #include #include From de2f197d13beb0ce3ce4a7ff66e255dc1be0217b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 14:52:32 -0700 Subject: [PATCH 30/51] Cleanup headers Signed-off-by: Nghia Truong --- cpp/src/groupby/sort/sort_helper.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index bf3171f1da8..a0abaf71160 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -33,17 +33,11 @@ #include #include -#include #include #include -#include #include -#include #include -#include -#include #include -#include #include #include From becb5931bae8b553affb21c714c4d40ddaa1c450 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 19:56:42 -0700 Subject: [PATCH 31/51] Use offsets iterator directly Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 77e9278ed59..b29c2cd186c 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -21,7 +21,6 @@ #include #include -#include #include #include @@ -72,13 +71,13 @@ void label_segments(InputIterator offsets_begin, using OutputType = typename thrust::iterator_value::type; thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(size_type{1}), - thrust::make_counting_iterator(num_segments), + offsets_begin + 1, + offsets_end, [offsets = offsets_begin, output = out_begin] __device__(auto const idx) { // Zero-normalized offsets. - auto const dst_idx = offsets[idx] - offsets[0]; + auto const dst_idx = idx - (*offsets); - // Scatter value `1` to the index at offsets[idx]. + // Scatter value `1` to the index at (idx - offsets[0]). // In case we have repeated offsets (i.e., we have empty segments), this // atomicAdd call will make sure the label values corresponding to these empty // segments will be skipped in the output. From e46181409f4710df3dcc45508cc0b83ad7496672 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 19:58:46 -0700 Subject: [PATCH 32/51] Initialize output at first Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index b29c2cd186c..04d7fcbc485 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -64,12 +64,13 @@ void label_segments(InputIterator offsets_begin, OutputIterator out_end, rmm::cuda_stream_view stream) { + using OutputType = typename thrust::iterator_value::type; + thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); + auto const num_segments = static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; if (num_segments <= 0) { return; } - using OutputType = typename thrust::iterator_value::type; - thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); thrust::for_each(rmm::exec_policy(stream), offsets_begin + 1, offsets_end, From b7e6d9a25a9c5dce4d902b15737b0ddef45772d0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 20:14:32 -0700 Subject: [PATCH 33/51] Fix loop, excluding the last offset value Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 04d7fcbc485..e8e2ae70f5f 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -72,8 +72,8 @@ void label_segments(InputIterator offsets_begin, if (num_segments <= 0) { return; } thrust::for_each(rmm::exec_policy(stream), - offsets_begin + 1, - offsets_end, + offsets_begin + 1, // exclude the first offset value + offsets_end - 1, // exclude the last offset value [offsets = offsets_begin, output = out_begin] __device__(auto const idx) { // Zero-normalized offsets. auto const dst_idx = idx - (*offsets); From bfe0bf0c05784a8ec68176bed77fa807682b5570 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 20:29:09 -0700 Subject: [PATCH 34/51] Add comment Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index e8e2ae70f5f..43cb5079d5a 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -64,6 +64,7 @@ void label_segments(InputIterator offsets_begin, OutputIterator out_end, rmm::cuda_stream_view stream) { + // Always fill the entire output array with `0` value regardless of the input. using OutputType = typename thrust::iterator_value::type; thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); From 42ce30b88e50eff2b511ac6c072370d2c73fb793 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 27 May 2022 20:27:31 -0700 Subject: [PATCH 35/51] Rewrite comment --- cpp/include/cudf/detail/labeling/label_segments.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index c23c693081e..9b5451cff79 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -127,7 +127,7 @@ void labels_to_offsets(InputIterator labels_begin, OutputIterator out_end, rmm::cuda_stream_view stream) { - // The output offsets need to be filled with `0` value first. + // Always fill the entire output array with `0` value regardless of the input. using OutputType = typename thrust::iterator_value::type; thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); From 15d036a417acb7b5c11d2a6d933eb20690e33285 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 29 May 2022 06:19:16 -0700 Subject: [PATCH 36/51] Try to reverse `sort_helper.cu` Signed-off-by: Nghia Truong --- cpp/src/groupby/sort/sort_helper.cu | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index a0abaf71160..10201782854 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -33,11 +32,17 @@ #include #include +#include #include #include +#include #include +#include #include +#include +#include #include +#include #include #include @@ -218,13 +223,22 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( _group_labels = std::make_unique(num_keys(stream), stream); auto& group_labels = *_group_labels; + if (num_keys(stream) == 0) return group_labels; - cudf::detail::label_segments(group_offsets(stream).begin(), - group_offsets(stream).end(), - group_labels.begin(), - group_labels.end(), - stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + index_vector::value_type{0}); + thrust::scatter(rmm::exec_policy(stream), + thrust::make_constant_iterator(1, decltype(num_groups(stream))(1)), + thrust::make_constant_iterator(1, num_groups(stream)), + group_offsets(stream).begin() + 1, + group_labels.begin()); + + thrust::inclusive_scan( + rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), group_labels.begin()); + return group_labels; } From 26aed349f951d9cca009ad5258c82d5c00c076d4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 29 May 2022 14:09:10 -0700 Subject: [PATCH 37/51] Revert "Try to reverse `sort_helper.cu`" This reverts commit 15d036a417acb7b5c11d2a6d933eb20690e33285. --- cpp/src/groupby/sort/sort_helper.cu | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 10201782854..a0abaf71160 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -32,17 +33,11 @@ #include #include -#include #include #include -#include #include -#include #include -#include -#include #include -#include #include #include @@ -223,22 +218,13 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( _group_labels = std::make_unique(num_keys(stream), stream); auto& group_labels = *_group_labels; - if (num_keys(stream) == 0) return group_labels; - thrust::uninitialized_fill(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - index_vector::value_type{0}); - thrust::scatter(rmm::exec_policy(stream), - thrust::make_constant_iterator(1, decltype(num_groups(stream))(1)), - thrust::make_constant_iterator(1, num_groups(stream)), - group_offsets(stream).begin() + 1, - group_labels.begin()); - - thrust::inclusive_scan( - rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), group_labels.begin()); - + cudf::detail::label_segments(group_offsets(stream).begin(), + group_offsets(stream).end(), + group_labels.begin(), + group_labels.end(), + stream); return group_labels; } From a9930b168248dc7a0c2e298d10e242348109fad6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 29 May 2022 14:17:00 -0700 Subject: [PATCH 38/51] Handle the special case when the output array is empty Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 43cb5079d5a..d8aad7705d1 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -68,9 +68,12 @@ void label_segments(InputIterator offsets_begin, using OutputType = typename thrust::iterator_value::type; thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); - auto const num_segments = - static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; - if (num_segments <= 0) { return; } + // Size of the input offset array needs to be at least two to properly define segments. + if (thrust::distance(offsets_begin, offsets_end) <= 1) { return; } + + // If the output array is empty, that means we have all empty segments. + // In such cases, there will not be any label value to output. + if (thrust::distance(out_begin, out_end) == 0) { return; } thrust::for_each(rmm::exec_policy(stream), offsets_begin + 1, // exclude the first offset value From 10812bbf3a4829ecfb36ad6262fa33a1e8790f98 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 29 May 2022 14:19:10 -0700 Subject: [PATCH 39/51] Reorganize code Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index d8aad7705d1..5d9f275dfd0 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -64,17 +64,17 @@ void label_segments(InputIterator offsets_begin, OutputIterator out_end, rmm::cuda_stream_view stream) { - // Always fill the entire output array with `0` value regardless of the input. + // If the output array is empty, that means we have all empty segments. + // In such cases, there will not be any label value to output. + if (thrust::distance(out_begin, out_end) == 0) { return; } + + // When the output array is not empty, always fill it with `0` value first. using OutputType = typename thrust::iterator_value::type; thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); // Size of the input offset array needs to be at least two to properly define segments. if (thrust::distance(offsets_begin, offsets_end) <= 1) { return; } - // If the output array is empty, that means we have all empty segments. - // In such cases, there will not be any label value to output. - if (thrust::distance(out_begin, out_end) == 0) { return; } - thrust::for_each(rmm::exec_policy(stream), offsets_begin + 1, // exclude the first offset value offsets_end - 1, // exclude the last offset value From 847311b3148c90cdb9e69eb89258b57ef19e0af0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 29 May 2022 14:23:41 -0700 Subject: [PATCH 40/51] Add a test Signed-off-by: Nghia Truong --- cpp/tests/lists/drop_list_duplicates_tests.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 945d138c789..e6b98da11cf 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -204,8 +204,16 @@ TEST_F(DropListDuplicatesTest, StringTestsNonNull) { // Trivial cases - empty input. { - auto const lists = StrListsCol{{}}; - auto const expected = StrListsCol{{}}; + auto const lists = StrListsCol{}; + auto const expected = StrListsCol{}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // Empty input lists. + { + auto const lists = StrListsCol{{}, {}, {}}; + auto const expected = StrListsCol{{}, {}, {}}; auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); } From fcc48e9760e4dbd1a3b0536c2769e421c779ce67 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 29 May 2022 15:13:19 -0700 Subject: [PATCH 41/51] Simplify code --- cpp/include/cudf/detail/labeling/label_segments.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 12f074ba206..98a820cc035 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -135,8 +135,7 @@ void labels_to_offsets(InputIterator labels_begin, using OutputType = typename thrust::iterator_value::type; thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); - auto const num_labels = static_cast(thrust::distance(labels_begin, labels_end)); - if (num_labels == 0) { return; } + if (thrust::distance(labels_begin, labels_end) == 0) { return; } auto const num_segments = static_cast(thrust::distance(out_begin, out_end)) - 1; From 1e7b843943cd6b9bca51995a09501229df16e695 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 29 May 2022 19:38:40 -0700 Subject: [PATCH 42/51] Modify test Signed-off-by: Nghia Truong --- .../lists/drop_list_duplicates_tests.cpp | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index e6b98da11cf..54d7ba0a95e 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -204,16 +204,8 @@ TEST_F(DropListDuplicatesTest, StringTestsNonNull) { // Trivial cases - empty input. { - auto const lists = StrListsCol{}; - auto const expected = StrListsCol{}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // Empty input lists. - { - auto const lists = StrListsCol{{}, {}, {}}; - auto const expected = StrListsCol{{}, {}, {}}; + auto const lists = StrListsCol{{}}; + auto const expected = StrListsCol{{}}; auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); } @@ -327,6 +319,19 @@ TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); } + // All input lists are empty. + { + auto const lists = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; + auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + + auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); + } + // Trivial cases. { auto const lists = ListsCol{0, 1, 2, 3, 4, 5}; From 6e098a28be9f3c80b2d3e672ed442328b8e6730b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 30 May 2022 08:02:56 -0700 Subject: [PATCH 43/51] Rewrite comment Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 5d9f275dfd0..f85cd7bc69b 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -65,7 +65,8 @@ void label_segments(InputIterator offsets_begin, rmm::cuda_stream_view stream) { // If the output array is empty, that means we have all empty segments. - // In such cases, there will not be any label value to output. + // In such cases, we must terminate immediately. Otherwise, the for loop below may try to access + // memory of the output array, resulting in "illegal memory access" error. if (thrust::distance(out_begin, out_end) == 0) { return; } // When the output array is not empty, always fill it with `0` value first. From 8dd7f2d6a31aa4e48718fe545082fcedeec2d8cd Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 30 May 2022 09:55:37 -0700 Subject: [PATCH 44/51] Change termination condition Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index f85cd7bc69b..a1329fdf99c 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -73,8 +73,9 @@ void label_segments(InputIterator offsets_begin, using OutputType = typename thrust::iterator_value::type; thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); - // Size of the input offset array needs to be at least two to properly define segments. - if (thrust::distance(offsets_begin, offsets_end) <= 1) { return; } + // If the offsets array has no more than 2 offset values, there will be at max 1 segment. + // In such cases, the output will just be an array of all `0` values (which we already filled). + if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; } thrust::for_each(rmm::exec_policy(stream), offsets_begin + 1, // exclude the first offset value @@ -85,8 +86,8 @@ void label_segments(InputIterator offsets_begin, // Scatter value `1` to the index at (idx - offsets[0]). // In case we have repeated offsets (i.e., we have empty segments), this - // atomicAdd call will make sure the label values corresponding to these empty - // segments will be skipped in the output. + // `atomicAdd` call will make sure the label values corresponding to these + // empty segments will be skipped in the output. atomicAdd(&output[dst_idx], OutputType{1}); }); thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); From 64a107ca6dc65572efdeeee12d12ce06180ae36d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 31 May 2022 14:22:00 -0700 Subject: [PATCH 45/51] Add comment Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index a1329fdf99c..991836390e2 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -65,8 +65,8 @@ void label_segments(InputIterator offsets_begin, rmm::cuda_stream_view stream) { // If the output array is empty, that means we have all empty segments. - // In such cases, we must terminate immediately. Otherwise, the for loop below may try to access - // memory of the output array, resulting in "illegal memory access" error. + // In such cases, we must terminate immediately. Otherwise, the `for_each` loop below may try to + // access memory of the output array, resulting in "illegal memory access" error. if (thrust::distance(out_begin, out_end) == 0) { return; } // When the output array is not empty, always fill it with `0` value first. @@ -75,6 +75,9 @@ void label_segments(InputIterator offsets_begin, // If the offsets array has no more than 2 offset values, there will be at max 1 segment. // In such cases, the output will just be an array of all `0` values (which we already filled). + // We should terminate here, otherwise the `for_each` loop and `inclusive_scan` below sill do + // their entire computation. That is unnecessary but expensive if we have the input offsets + // defining a very large segment. if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; } thrust::for_each(rmm::exec_policy(stream), From 77002efb56c4c69b25284e2ce98b5983fd1e4774 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 31 May 2022 16:43:05 -0700 Subject: [PATCH 46/51] Fix comment Signed-off-by: Nghia Truong --- cpp/include/cudf/detail/labeling/label_segments.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 991836390e2..b9533c12de0 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -75,9 +75,9 @@ void label_segments(InputIterator offsets_begin, // If the offsets array has no more than 2 offset values, there will be at max 1 segment. // In such cases, the output will just be an array of all `0` values (which we already filled). - // We should terminate here, otherwise the `for_each` loop and `inclusive_scan` below sill do - // their entire computation. That is unnecessary but expensive if we have the input offsets - // defining a very large segment. + // We should terminate here, otherwise the `inclusive_scan` call below still do its entire + // computation. That is unnecessary and may be expensive if we have the input offsets defining a + // very large segment. if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; } thrust::for_each(rmm::exec_policy(stream), From 136511d38046c9262ce67617a5cb39ba6f647807 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 31 May 2022 16:52:52 -0700 Subject: [PATCH 47/51] Rename `out_` iterators into `label_` Signed-off-by: Nghia Truong --- .../cudf/detail/labeling/label_segments.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index b9533c12de0..707a28424e5 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -53,25 +53,25 @@ namespace cudf::detail { * * @param offsets_begin The beginning of the offsets that define segments. * @param offsets_end The end of the offsets that define segments. - * @param out_begin The beginning of the output label range. - * @param out_end The end of the output label range. + * @param label_begin The beginning of the output label range. + * @param label_end The end of the output label range. * @param stream CUDA stream used for device memory operations and kernel launches. */ template void label_segments(InputIterator offsets_begin, InputIterator offsets_end, - OutputIterator out_begin, - OutputIterator out_end, + OutputIterator label_begin, + OutputIterator label_end, rmm::cuda_stream_view stream) { // If the output array is empty, that means we have all empty segments. // In such cases, we must terminate immediately. Otherwise, the `for_each` loop below may try to // access memory of the output array, resulting in "illegal memory access" error. - if (thrust::distance(out_begin, out_end) == 0) { return; } + if (thrust::distance(label_begin, label_end) == 0) { return; } // When the output array is not empty, always fill it with `0` value first. using OutputType = typename thrust::iterator_value::type; - thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); + thrust::uninitialized_fill(rmm::exec_policy(stream), label_begin, label_end, OutputType{0}); // If the offsets array has no more than 2 offset values, there will be at max 1 segment. // In such cases, the output will just be an array of all `0` values (which we already filled). @@ -83,7 +83,7 @@ void label_segments(InputIterator offsets_begin, thrust::for_each(rmm::exec_policy(stream), offsets_begin + 1, // exclude the first offset value offsets_end - 1, // exclude the last offset value - [offsets = offsets_begin, output = out_begin] __device__(auto const idx) { + [offsets = offsets_begin, output = label_begin] __device__(auto const idx) { // Zero-normalized offsets. auto const dst_idx = idx - (*offsets); @@ -93,7 +93,7 @@ void label_segments(InputIterator offsets_begin, // empty segments will be skipped in the output. atomicAdd(&output[dst_idx], OutputType{1}); }); - thrust::inclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); + thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin); } } // namespace cudf::detail From 6578a3e4d122ec45b9a7d93dec3fccbcf556575b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 31 May 2022 21:44:45 -0700 Subject: [PATCH 48/51] Rewrite comments --- .../cudf/detail/labeling/label_segments.cuh | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 2425fb0baf8..9c366aae8c5 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -78,7 +78,7 @@ void label_segments(InputIterator offsets_begin, // If the offsets array has no more than 2 offset values, there will be at max 1 segment. // In such cases, the output will just be an array of all `0` values (which we already filled). - // We should terminate here, otherwise the `inclusive_scan` call below still do its entire + // We should terminate from here, otherwise the `inclusive_scan` call below still do its entire // computation. That is unnecessary and may be expensive if we have the input offsets defining a // very large segment. if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; } @@ -125,24 +125,31 @@ void label_segments(InputIterator offsets_begin, * * @param labels_begin The beginning of the labels that define segments. * @param labels_end The end of the labels that define segments. - * @param out_begin The beginning of the output offset range. - * @param out_end The end of the output offset range. + * @param offsets_begin The beginning of the output offset range. + * @param offsets_end The end of the output offset range. * @param stream CUDA stream used for device memory operations and kernel launches. */ template void labels_to_offsets(InputIterator labels_begin, InputIterator labels_end, - OutputIterator out_begin, - OutputIterator out_end, + OutputIterator offsets_begin, + OutputIterator offsets_end, rmm::cuda_stream_view stream) { // Always fill the entire output array with `0` value regardless of the input. using OutputType = typename thrust::iterator_value::type; - thrust::uninitialized_fill(rmm::exec_policy(stream), out_begin, out_end, OutputType{0}); - + thrust::uninitialized_fill(rmm::exec_policy(stream), offsets_begin, offsets_end, OutputType{0}); + + // If there is not any label value, we will have zero segment or all empty segments. We should + // terminate from here because: + // - If we have zero segment, `num_segments` computed below will be negative which may cascade to + // undefined behavior if we continue. + // - If we have all empty segments, the output offset values will be all `0`, which we already + // filled above. if (thrust::distance(labels_begin, labels_end) == 0) { return; } - auto const num_segments = static_cast(thrust::distance(out_begin, out_end)) - 1; + auto const num_segments = + static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; //================================================================================ // Let consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. @@ -155,27 +162,27 @@ void labels_to_offsets(InputIterator labels_begin, // Given the example above, we will have this array containing [4, 2, 4]. auto list_sizes = rmm::device_uvector(num_segments, stream); - // Count the numbers of unique labels in the input. + // Count the numbers of labels in the each segment. auto const end = thrust::reduce_by_key(rmm::exec_policy(stream), labels_begin, // keys labels_end, // keys thrust::make_constant_iterator(1), - list_indices.begin(), // output unique input labels + list_indices.begin(), // output unique label values list_sizes.begin()); // count for each label auto const num_non_empty_segments = thrust::distance(list_indices.begin(), end.first); // Scatter segment sizes into the end position of their corresponding segment indices. - // Given the example above, we scatter [4, 2, 4] by the scatter_map [0, 1, 4], resulting + // Given the example above, we scatter [4, 2, 4] by the scatter map [0, 1, 4], resulting // output = [4, 2, 0, 0, 4, 0]. thrust::scatter(rmm::exec_policy(stream), list_sizes.begin(), list_sizes.begin() + num_non_empty_segments, list_indices.begin(), - out_begin); + offsets_begin); // Generate offsets from sizes. // Given the example above, the final output is [0, 4, 6, 6, 6, 10]. - thrust::exclusive_scan(rmm::exec_policy(stream), out_begin, out_end, out_begin); + thrust::exclusive_scan(rmm::exec_policy(stream), offsets_begin, offsets_end, offsets_begin); } } // namespace cudf::detail From 88d555437d2a15331f781be22165bc9a80dfdf3a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 1 Jun 2022 11:04:18 -0700 Subject: [PATCH 49/51] Update comments --- cpp/include/cudf/detail/labeling/label_segments.cuh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 9c366aae8c5..df96706d0a3 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -78,7 +78,7 @@ void label_segments(InputIterator offsets_begin, // If the offsets array has no more than 2 offset values, there will be at max 1 segment. // In such cases, the output will just be an array of all `0` values (which we already filled). - // We should terminate from here, otherwise the `inclusive_scan` call below still do its entire + // We should terminate from here, otherwise the `inclusive_scan` call below still does its entire // computation. That is unnecessary and may be expensive if we have the input offsets defining a // very large segment. if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; } @@ -145,14 +145,16 @@ void labels_to_offsets(InputIterator labels_begin, // - If we have zero segment, `num_segments` computed below will be negative which may cascade to // undefined behavior if we continue. // - If we have all empty segments, the output offset values will be all `0`, which we already - // filled above. + // filled above. If we continue, the `exclusive_scan` call below still does its entire + // computation. That is unnecessary and may be expensive if we have the input labels defining + // a very large number of segments. if (thrust::distance(labels_begin, labels_end) == 0) { return; } auto const num_segments = static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; //================================================================================ - // Let consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. + // Let's consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. // This stores the unique label values. // Given the example above, we will have this array containing [0, 1, 4]. From c13b4b9e00e2274b3a9a05a0cb468891d94dcb9a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 3 Jun 2022 12:37:26 -0700 Subject: [PATCH 50/51] Change all `size_type` into `OutputType` --- cpp/include/cudf/detail/labeling/label_segments.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index df96706d0a3..9cf253f7b2b 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -151,24 +151,24 @@ void labels_to_offsets(InputIterator labels_begin, if (thrust::distance(labels_begin, labels_end) == 0) { return; } auto const num_segments = - static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; + static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; //================================================================================ // Let's consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]. // This stores the unique label values. // Given the example above, we will have this array containing [0, 1, 4]. - auto list_indices = rmm::device_uvector(num_segments, stream); + auto list_indices = rmm::device_uvector(num_segments, stream); // Stores the non-zero segment sizes. // Given the example above, we will have this array containing [4, 2, 4]. - auto list_sizes = rmm::device_uvector(num_segments, stream); + auto list_sizes = rmm::device_uvector(num_segments, stream); // Count the numbers of labels in the each segment. auto const end = thrust::reduce_by_key(rmm::exec_policy(stream), labels_begin, // keys labels_end, // keys - thrust::make_constant_iterator(1), + thrust::make_constant_iterator(1), list_indices.begin(), // output unique label values list_sizes.begin()); // count for each label auto const num_non_empty_segments = thrust::distance(list_indices.begin(), end.first); From 7a23dcded25e9658e7834033e070ede061ac55e9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 3 Jun 2022 12:44:04 -0700 Subject: [PATCH 51/51] Remove casting for `num_segments` --- cpp/include/cudf/detail/labeling/label_segments.cuh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/labeling/label_segments.cuh b/cpp/include/cudf/detail/labeling/label_segments.cuh index 9cf253f7b2b..e30f5b3ee91 100644 --- a/cpp/include/cudf/detail/labeling/label_segments.cuh +++ b/cpp/include/cudf/detail/labeling/label_segments.cuh @@ -142,16 +142,15 @@ void labels_to_offsets(InputIterator labels_begin, // If there is not any label value, we will have zero segment or all empty segments. We should // terminate from here because: - // - If we have zero segment, `num_segments` computed below will be negative which may cascade to - // undefined behavior if we continue. + // - If we have zero segment, the output array is empty thus `num_segments` computed below is + // wrong and may cascade to undefined behavior if we continue. // - If we have all empty segments, the output offset values will be all `0`, which we already // filled above. If we continue, the `exclusive_scan` call below still does its entire // computation. That is unnecessary and may be expensive if we have the input labels defining // a very large number of segments. if (thrust::distance(labels_begin, labels_end) == 0) { return; } - auto const num_segments = - static_cast(thrust::distance(offsets_begin, offsets_end)) - 1; + auto const num_segments = thrust::distance(offsets_begin, offsets_end) - 1; //================================================================================ // Let's consider an example: Given input labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ].