From 9bf540add6514d734c59bea0c530f094d041eea2 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 9 Jun 2022 14:22:22 -0700 Subject: [PATCH] Use `distinct` in Cython --- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 6 ++++++ python/cudf/cudf/_lib/stream_compaction.pyx | 19 +++---------------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 61efd040807..940d981f739 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -39,6 +39,12 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ duplicate_keep_option keep, null_equality nulls_equal) except + + cdef unique_ptr[table] distinct( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + + cdef size_type distinct_count( column_view source_table, null_policy null_handling, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index b645fcd59d0..97f9c3a2770 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -9,13 +9,12 @@ from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.sorting cimport stable_sort_by_key as cpp_stable_sort_by_key from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, distinct_count as cpp_distinct_count, drop_nulls as cpp_drop_nulls, duplicate_keep_option, - unique as cpp_unique, + distinct as cpp_distinct, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -167,21 +166,9 @@ def drop_duplicates(list columns, cdef unique_ptr[table] c_result with nogil: - # cudf::unique keeps unique rows in each consecutive group of - # equivalent rows. To match the behavior of pandas.DataFrame. - # drop_duplicates, users need to stable sort the input first - # and then invoke cudf::unique. - sorted_source_table = move( - cpp_stable_sort_by_key( - source_table_view, - keys_view, - column_order, - null_precedence - ) - ) c_result = move( - cpp_unique( - sorted_source_table.get().view(), + cpp_distinct( + source_table_view, cpp_keys, cpp_keep_option, cpp_nulls_equal