Skip to content

Commit

Permalink
Migrate lists/contains to pylibcudf (#15981)
Browse files Browse the repository at this point in the history
Part of #15162.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15981
  • Loading branch information
Matt711 authored Jun 28, 2024
1 parent c40e0cc commit 565c0d1
Show file tree
Hide file tree
Showing 9 changed files with 281 additions and 69 deletions.
3 changes: 2 additions & 1 deletion cpp/include/cudf/lists/lists_column_view.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,6 +38,7 @@ namespace cudf {
*/
class lists_column_view : private column_view {
public:
lists_column_view() = default;
/**
* @brief Construct a new lists column view object from a column view.
*
Expand Down
72 changes: 20 additions & 52 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ from libcpp.utility cimport move
from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.lists.contains cimport (
contains,
index_of as cpp_index_of,
)
from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
count_elements as cpp_count_elements,
)
Expand All @@ -26,19 +22,19 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
distinct as cpp_distinct,
)
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
null_equality,
null_order,
order,
size_type,
)
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport columns_from_pylibcudf_table

from cudf._lib import pylibcudf

from cudf._lib.pylibcudf cimport Scalar


@acquire_spill_lock()
def count_elements(Column col):
Expand Down Expand Up @@ -153,64 +149,36 @@ def extract_element_column(Column col, Column index):


@acquire_spill_lock()
def contains_scalar(Column col, object py_search_key):

cdef DeviceScalar search_key = py_search_key.device_value

cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
def contains_scalar(Column col, py_search_key):
return Column.from_pylibcudf(
pylibcudf.lists.contains(
col.to_pylibcudf(mode="read"),
<Scalar> py_search_key.device_value.c_value,
)
)
cdef const scalar* search_key_value = search_key.get_raw_ptr()

cdef unique_ptr[column] c_result

with nogil:
c_result = move(contains(
list_view.get()[0],
search_key_value[0],
))
result = Column.from_unique_ptr(move(c_result))
return result


@acquire_spill_lock()
def index_of_scalar(Column col, object py_search_key):

cdef DeviceScalar search_key = py_search_key.device_value

cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
return Column.from_pylibcudf(
pylibcudf.lists.index_of(
col.to_pylibcudf(mode="read"),
<Scalar> py_search_key.device_value.c_value,
True,
)
)
cdef const scalar* search_key_value = search_key.get_raw_ptr()

cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_index_of(
list_view.get()[0],
search_key_value[0],
))
return Column.from_unique_ptr(move(c_result))


@acquire_spill_lock()
def index_of_column(Column col, Column search_keys):

cdef column_view keys_view = search_keys.view()

cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
return Column.from_pylibcudf(
pylibcudf.lists.index_of(
col.to_pylibcudf(mode="read"),
search_keys.to_pylibcudf(mode="read"),
True,
)
)

cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_index_of(
list_view.get()[0],
keys_view,
))
return Column.from_unique_ptr(move(c_result))


@acquire_spill_lock()
def concatenate_rows(list source_columns):
Expand Down
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/column.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
column_view,
mutable_column_view,
)
from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
lists_column_view,
)
from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type

from .gpumemoryview cimport gpumemoryview
Expand Down Expand Up @@ -56,3 +59,4 @@ cdef class ListColumnView:
cdef Column _column
cpdef child(self)
cpdef offsets(self)
cdef lists_column_view view(self) nogil
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,15 @@ cdef class ListColumnView:
"""The offsets column of the underlying list column."""
return self._column.child(1)

cdef lists_column_view view(self) nogil:
    """Build the libcudf ``lists_column_view`` for the wrapped column.

    Intended for pylibcudf's own functions, which need a
    ``lists_column_view`` when calling libcudf list algorithms. Users
    (including direct pylibcudf Cython users) should generally not need
    to call this themselves.
    """
    # Take the plain column_view of the wrapped column first, then
    # reinterpret it as a lists view.
    cdef column_view parent_view = self._column.view()
    return lists_column_view(parent_view)


@functools.cache
def _datatype_from_dtype_desc(desc):
Expand Down
29 changes: 23 additions & 6 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t
from libcpp.memory cimport unique_ptr

from cudf._lib.exception_handler cimport cudf_exception_handler
Expand All @@ -12,17 +13,33 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar


cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:

cpdef enum class duplicate_find_option(int32_t):
FIND_FIRST
FIND_LAST

cdef unique_ptr[column] contains(
lists_column_view lists,
scalar search_key,
const lists_column_view& lists,
const scalar& search_key,
) except +cudf_exception_handler

cdef unique_ptr[column] contains(
const lists_column_view& lists,
const column_view& search_keys,
) except +cudf_exception_handler

cdef unique_ptr[column] contains_nulls(
const lists_column_view& lists,
) except +cudf_exception_handler

cdef unique_ptr[column] index_of(
lists_column_view lists,
scalar search_key,
const lists_column_view& lists,
const scalar& search_key,
duplicate_find_option find_option,
) except +cudf_exception_handler

cdef unique_ptr[column] index_of(
lists_column_view lists,
column_view search_keys,
const lists_column_view& lists,
const column_view& search_keys,
duplicate_find_option find_option,
) except +cudf_exception_handler
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type

cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil:
cdef cppclass lists_column_view(column_view):
lists_column_view() except +
lists_column_view(const column_view& lists_column) except +
column_view parent() except +
column_view offsets() except +
Expand Down
10 changes: 10 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/lists.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,21 @@ from libcpp cimport bool
from cudf._lib.pylibcudf.libcudf.types cimport size_type

from .column cimport Column
from .scalar cimport Scalar
from .table cimport Table

# Fused type that lets one signature accept either a Column or a Scalar.
# It is used for the search-key argument of ``contains`` and ``index_of``
# declared in this file.
ctypedef fused ColumnOrScalar:
    Column
    Scalar

cpdef Table explode_outer(Table, size_type explode_column_idx)

cpdef Column concatenate_rows(Table)

cpdef Column concatenate_list_elements(Column, bool dropna)

cpdef Column contains(Column, ColumnOrScalar)

cpdef Column contains_nulls(Column)

cpdef Column index_of(Column, ColumnOrScalar, bool)
124 changes: 118 additions & 6 deletions python/cudf/cudf/_lib/pylibcudf/lists.pyx
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode
from cudf._lib.pylibcudf.libcudf.lists cimport (
contains as cpp_contains,
explode as cpp_explode,
)
from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
concatenate_list_elements as cpp_concatenate_list_elements,
concatenate_null_policy,
concatenate_rows as cpp_concatenate_rows,
)
from cudf._lib.pylibcudf.libcudf.table.table cimport table
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.lists cimport ColumnOrScalar

from .column cimport Column
from .column cimport Column, ListColumnView
from .scalar cimport Scalar
from .table cimport Table


Expand Down Expand Up @@ -71,15 +77,15 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
----------
input : Column
The input column
dropna : bool
If true, null list elements will be ignored
from concatenation. Otherwise any input null values will result in
the corresponding output row being set to null.
Returns
-------
Column
A new Column of concatenated list elements
dropna : bool
If true, null list elements will be ignored
from concatenation. Otherwise any input null values will result in
the corresponding output row being set to null.
"""
cdef concatenate_null_policy null_policy = (
concatenate_null_policy.IGNORE if dropna
Expand All @@ -94,3 +100,109 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
))

return Column.from_libcudf(move(c_result))


cpdef Column contains(Column input, ColumnOrScalar search_key):
    """Report, for each list row of ``input``, whether ``search_key``
    is present in that row.

    ``search_key`` may be a
    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.

    For details, see :cpp:func:`contains`.

    Parameters
    ----------
    input : Column
        The input column.
    search_key : Union[Column, Scalar]
        The search key.

    Returns
    -------
    Column
        A new Column of bools indicating if the search_key was
        found in the list column.

    Raises
    ------
    TypeError
        If ``search_key`` is neither a Column nor a Scalar.
    """
    cdef unique_ptr[column] found
    cdef ListColumnView lists_view = input.list_view()

    if not isinstance(search_key, (Column, Scalar)):
        raise TypeError("Must pass a Column or Scalar")

    # The fused-type test selects the matching libcudf overload for each
    # specialization of this function (column keys vs. a scalar key).
    if ColumnOrScalar is Column:
        with nogil:
            found = move(cpp_contains.contains(
                lists_view.view(),
                search_key.view(),
            ))
    else:
        with nogil:
            found = move(cpp_contains.contains(
                lists_view.view(),
                dereference(search_key.get()),
            ))
    return Column.from_libcudf(move(found))


cpdef Column contains_nulls(Column input):
    """Report, for each row of the lists column, whether that row
    contains a null value.

    Parameters
    ----------
    input : Column
        The input column.

    Returns
    -------
    Column
        A new Column of bools indicating if the list column
        contains a null value.
    """
    cdef ListColumnView lists_view = input.list_view()
    cdef unique_ptr[column] has_nulls

    with nogil:
        has_nulls = move(
            cpp_contains.contains_nulls(lists_view.view())
        )
    return Column.from_libcudf(move(has_nulls))


cpdef Column index_of(
    Column input,
    ColumnOrScalar search_key,
    bool find_first_option
):
    """Locate ``search_key`` within each list row of ``input`` and
    return the position of the match.

    ``search_key`` may be a
    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.

    For details, see :cpp:func:`index_of`.

    Parameters
    ----------
    input : Column
        The input column.
    search_key : Union[Column, Scalar]
        The search key.
    find_first_option : bool
        If true, index_of returns the first match.
        Otherwise the last match is returned.

    Returns
    -------
    Column
        A new Column of index values that indicate where in the
        list column the search_key was found. An index value
        of -1 indicates that the search_key was not found.
    """
    cdef unique_ptr[column] positions
    cdef ListColumnView lists_view = input.list_view()

    # Translate the Python-level flag into libcudf's duplicate policy.
    cdef cpp_contains.duplicate_find_option which_match
    if find_first_option:
        which_match = cpp_contains.duplicate_find_option.FIND_FIRST
    else:
        which_match = cpp_contains.duplicate_find_option.FIND_LAST

    # The fused-type test selects the matching libcudf overload for each
    # specialization of this function (column keys vs. a scalar key).
    if ColumnOrScalar is Column:
        with nogil:
            positions = move(cpp_contains.index_of(
                lists_view.view(),
                search_key.view(),
                which_match,
            ))
    else:
        with nogil:
            positions = move(cpp_contains.index_of(
                lists_view.view(),
                dereference(search_key.get()),
                which_match,
            ))
    return Column.from_libcudf(move(positions))
Loading

0 comments on commit 565c0d1

Please sign in to comment.