From 62c0ae89ab777e88c396ebb3021fba376d1e1eb7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 13 Jul 2022 09:57:13 -0700 Subject: [PATCH] Use `cudf::lists::distinct` in Python binding (#11234) The Python binding has a `lists.unique()` API to extract the unique list elements of the input lists column. Previously, it was implemented by calling `cudf::lists::drop_list_duplicates`, which performs a segmented sort on the input lists and then extracts the unique list elements. This PR changes the implementation of `lists.unique()` to use `cudf::lists::distinct`, which can improve performance by using a hash table to find distinct elements without a segmented sort. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/11234 --- ...list_duplicates.pxd => stream_compaction.pxd} | 6 +++--- python/cudf/cudf/_lib/lists.pyx | 16 +++++++--------- python/cudf/cudf/core/column/lists.py | 6 ++---- 3 files changed, 12 insertions(+), 16 deletions(-) rename python/cudf/cudf/_lib/cpp/lists/{drop_list_duplicates.pxd => stream_compaction.pxd} (72%) diff --git a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd b/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd similarity index 72% rename from python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd rename to python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd index 81d54104320..58c1ab1dcec 100644 --- a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr @@ -7,9 +7,9 @@ from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.types cimport nan_equality, null_equality -cdef extern from "cudf/lists/drop_list_duplicates.hpp" \ +cdef extern from "cudf/lists/stream_compaction.hpp" \ namespace "cudf::lists" nogil: - cdef unique_ptr[column] drop_list_duplicates( + cdef unique_ptr[column] distinct( const lists_column_view lists_column, null_equality nulls_equal, nan_equality nans_equal diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 025fb0665d3..581207c97a5 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -15,12 +15,10 @@ from cudf._lib.cpp.lists.combine cimport ( from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements, ) -from cudf._lib.cpp.lists.drop_list_duplicates cimport ( - drop_list_duplicates as cpp_drop_list_duplicates, -) from cudf._lib.cpp.lists.explode cimport explode_outer as cpp_explode_outer from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.lists.sorting cimport sort_lists as cpp_sort_lists +from cudf._lib.cpp.lists.stream_compaction cimport distinct as cpp_distinct from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -75,12 +73,12 @@ def explode_outer( return columns_from_unique_ptr(move(c_result)) -def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): +def distinct(Column col, bool nulls_equal, bool nans_all_equal): """ - nans_all_equal == True indicates that libcudf should treat any two elements - from {+nan, -nan} as equal, and as unequal otherwise. nulls_equal == True indicates that libcudf should treat any two nulls as equal, and as unequal otherwise. 
+ nans_all_equal == True indicates that libcudf should treat any two + elements from {-nan, +nan} as equal, and as unequal otherwise. """ cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) @@ -96,9 +94,9 @@ def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): with nogil: c_result = move( - cpp_drop_list_duplicates(list_view.get()[0], - c_nulls_equal, - c_nans_equal) + cpp_distinct(list_view.get()[0], + c_nulls_equal, + c_nans_equal) ) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index e8a5638f07a..c6a19f374bd 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -13,7 +13,7 @@ concatenate_rows, contains_scalar, count_elements, - drop_list_duplicates, + distinct, extract_element_column, extract_element_scalar, index_of_column, @@ -603,9 +603,7 @@ def unique(self) -> ParentType: raise NotImplementedError("Nested lists unique is not supported.") return self._return_or_inplace( - drop_list_duplicates( - self._column, nulls_equal=True, nans_all_equal=True - ) + distinct(self._column, nulls_equal=True, nans_all_equal=True) ) def sort_values(