Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix issue with extracting nested column data & dtype preservation #11671

Merged
merged 11 commits into from
Sep 12, 2022
43 changes: 29 additions & 14 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -87,56 +87,71 @@ cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj):
dlpack_tensor.deleter(dlpack_tensor)


cdef vector[column_metadata] gather_metadata(dict cols_dtypes) except *:
cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *:
"""
Generates a column_metadata vector for each column.

Parameters
----------
cols_dtypes : dict
A dict mapping of column names & their dtypes.
cols_dtypes : iterable
An iterable of ``(column_name, dtype)`` pairs.
"""
cdef vector[column_metadata] cpp_metadata
cpp_metadata.reserve(len(cols_dtypes))

if cols_dtypes is not None:
for idx, (col_name, col_dtype) in enumerate(cols_dtypes.items()):
for idx, (col_name, col_dtype) in enumerate(cols_dtypes):
cpp_metadata.push_back(column_metadata(col_name.encode()))
if is_struct_dtype(col_dtype):
if is_struct_dtype(col_dtype) or is_list_dtype(col_dtype):
_set_col_children_metadata(col_dtype, cpp_metadata[idx])
else:
raise TypeError(
"A dictionary of column names and dtypes is required to "
"An iterable of (column_name, dtype) pairs is required to "
"construct column_metadata"
)
return cpp_metadata

cdef _set_col_children_metadata(dtype,
column_metadata& col_meta):

cdef column_metadata element_metadata

if is_struct_dtype(dtype):
col_meta.children_meta.reserve(len(dtype.fields))
for i, name in enumerate(dtype.fields):
value = dtype.fields[name]
col_meta.children_meta.push_back(column_metadata(name.encode()))
for name, value in dtype.fields.items():
element_metadata = column_metadata(name.encode())
_set_col_children_metadata(
value, col_meta.children_meta[i]
value, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
elif is_list_dtype(dtype):
col_meta.children_meta.reserve(2)
# Offsets - child 0
col_meta.children_meta.push_back(column_metadata())

# Element column - child 1
element_metadata = column_metadata()
_set_col_children_metadata(
dtype.element_type, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
else:
col_meta.children_meta.push_back(column_metadata())


def to_arrow(list source_columns, dict cols_dtypes):
def to_arrow(list source_columns, object column_dtypes):
"""Convert a list of columns from
cudf Frame to a PyArrow Table.

Parameters
----------
source_columns : a list of columns to convert
cols_dtype : A dict mapping of column names & their dtypes.
column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs

Returns
-------
pyarrow table
"""
cdef vector[column_metadata] cpp_metadata = gather_metadata(cols_dtypes)
cdef vector[column_metadata] cpp_metadata = gather_metadata(column_dtypes)
cdef table_view input_table_view = table_view_from_columns(source_columns)

cdef shared_ptr[CTable] cpp_arrow_table
Expand Down
11 changes: 2 additions & 9 deletions python/cudf/cudf/_lib/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype):
children=tuple(columns),
size=1,
)
table = to_arrow([struct_col], {"None": dtype})
table = to_arrow([struct_col], [("None", dtype)])
python_dict = table.to_pydict()["None"][0]
return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict}

Expand Down Expand Up @@ -428,14 +428,7 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype):
cdef column_view list_col_view = (<list_scalar*>s.get()).view()
cdef Column element_col = Column.from_column_view(list_col_view, None)

arrow_obj = to_arrow(
[element_col],
{
"None": dtype.element_type
if isinstance(element_col, cudf.core.column.StructColumn)
else dtype
}
)["None"]
arrow_obj = to_arrow([element_col], [("None", dtype.element_type)])["None"]

result = arrow_obj.to_pylist()
return _nested_na_replace(result)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def to_arrow(self) -> pa.Array:
4
]
"""
return libcudf.interop.to_arrow([self], {"None": self.dtype})[
return libcudf.interop.to_arrow([self], [("None", self.dtype)])[
"None"
].chunk(0)

Expand Down
7 changes: 6 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from functools import cached_property
from typing import List, Optional, Sequence, Union
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -164,6 +164,11 @@ def set_base_data(self, value):
else:
super().set_base_data(value)

def set_base_children(self, value: Tuple[ColumnBase, ...]):
super().set_base_children(value)
_, values = value
self._dtype = cudf.ListDtype(element_type=values.dtype)

@property
def __cuda_array_interface__(self):
raise NotImplementedError(
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,7 @@ def test_memory_usage():
0,
),
([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], 2),
([[[{"a": 1, "b": 2, "c": 10}]]], 0),
],
)
def test_nested_list_extract_host_scalars(data, idx):
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2545,3 +2545,26 @@ def test_parquet_columns_and_index_param(index, columns):
got = cudf.read_parquet(buffer, columns=columns)

assert_eq(expected, got, check_index_type=True)


def test_parquet_nested_struct_list():
buffer = BytesIO()
data = {
"payload": {
"Domain": {
"Name": "abc",
"Id": {"Name": "host", "Value": "127.0.0.8"},
},
"StreamId": "12345678",
"Duration": 10,
"Offset": 12,
"Resource": [{"Name": "ZoneName", "Value": "RAPIDS"}],
}
}
df = cudf.DataFrame({"a": cudf.Series(data)})

df.to_parquet(buffer)
expected = pd.read_parquet(buffer)
actual = cudf.read_parquet(buffer)
assert_eq(expected, actual)
assert_eq(actual.a.dtype, df.a.dtype)