Preserve the inner dtype of a list column when exploding it.

Thread an optional, keyword-only `override_dtypes` iterable through
`_from_columns_like_self` and `_copy_type_metadata`.  Each non-None entry
replaces the dtype that would otherwise be copied from the matching column of
the source frame.  `IndexedFrame._explode` uses it to give the exploded column
the element type of the original list column, which keeps struct key names.

Review fix: `MultiIndex._copy_type_metadata` accepted `override_dtypes` but
dropped it when delegating to `super()`; it now forwards the argument, matching
`GenericIndex._copy_type_metadata`.  The multiindex.py hunk header is adjusted
accordingly (+1854,11); the post-image blob hash on that file's `index` line
has not been regenerated.

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 549b8bae12a..6898ae4941c 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -101,7 +101,9 @@ def __getitem__(self, key):
     def __contains__(self, item):
         return item in self._values
 
-    def _copy_type_metadata(self: BaseIndexT, other: BaseIndexT) -> BaseIndexT:
+    def _copy_type_metadata(
+        self: BaseIndexT, other: BaseIndexT, *, override_dtypes=None
+    ) -> BaseIndexT:
         raise NotImplementedError
 
     def get_level_values(self, level):
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c07a88e9396..15bef5cad00 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -39,7 +39,7 @@
 import cudf
 import cudf.core.common
 from cudf import _lib as libcudf
-from cudf._typing import ColumnLike, NotImplementedType
+from cudf._typing import ColumnLike, Dtype, NotImplementedType
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
     is_bool_dtype,
@@ -6536,9 +6536,14 @@ def _from_columns_like_self(
         columns: List[ColumnBase],
         column_names: abc.Iterable[str],
         index_names: Optional[List[str]] = None,
+        *,
+        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
     ) -> DataFrame:
         result = super()._from_columns_like_self(
-            columns, column_names, index_names
+            columns,
+            column_names,
+            index_names,
+            override_dtypes=override_dtypes,
         )
         result._set_column_names_like(self)
         return result
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index ec78a8a37cf..5183090c0df 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import copy
+import itertools
 import operator
 import pickle
 import warnings
@@ -131,6 +132,8 @@ def _from_columns_like_self(
         self,
         columns: List[ColumnBase],
         column_names: Optional[abc.Iterable[str]] = None,
+        *,
+        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
     ):
         """Construct a Frame from a list of columns with metadata from self.
 
@@ -139,7 +142,7 @@ def _from_columns_like_self(
         if column_names is None:
             column_names = self._column_names
         frame = self.__class__._from_columns(columns, column_names)
-        return frame._copy_type_metadata(self)
+        return frame._copy_type_metadata(self, override_dtypes=override_dtypes)
 
     def _mimic_inplace(
         self: T, result: T, inplace: bool = False
@@ -1160,17 +1163,31 @@ def _positions_from_column_names(self, column_names):
             if name in set(column_names)
         ]
 
-    def _copy_type_metadata(self: T, other: T) -> T:
+    def _copy_type_metadata(
+        self: T,
+        other: T,
+        *,
+        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
+    ) -> T:
         """
         Copy type metadata from each column of `other` to the corresponding
         column of `self`.
+
+        If override_dtypes is provided, any non-None entry
+        will be used in preference to the relevant column of other to
+        provide the new dtype.
+
         See `ColumnBase._with_type_metadata` for more information.
         """
-        for name, col, other_col in zip(
-            self._data.keys(), self._data.values(), other._data.values()
-        ):
+        if override_dtypes is None:
+            override_dtypes = itertools.repeat(None)
+        dtypes = (
+            dtype if dtype is not None else col.dtype
+            for (dtype, col) in zip(override_dtypes, other._data.values())
+        )
+        for (name, col), dtype in zip(self._data.items(), dtypes):
             self._data.set_by_label(
-                name, col._with_type_metadata(other_col.dtype), validate=False
+                name, col._with_type_metadata(dtype), validate=False
             )
 
         return self
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index d1995615e0c..57a10358561 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -184,7 +184,9 @@ def __init__(
         # whereas _stop is an upper bound.
         self._end = self._start + self._step * (len(self._range) - 1)
 
-    def _copy_type_metadata(self: RangeIndex, other: RangeIndex) -> RangeIndex:
+    def _copy_type_metadata(
+        self: RangeIndex, other: RangeIndex, *, override_dtypes=None
+    ) -> RangeIndex:
         # There is no metadata to be copied for RangeIndex since it does not
         # have an underlying column.
         return self
@@ -978,9 +980,11 @@ def _binaryop(
     # Override just to make mypy happy.
     @_cudf_nvtx_annotate
     def _copy_type_metadata(
-        self: GenericIndex, other: GenericIndex
+        self: GenericIndex, other: GenericIndex, *, override_dtypes=None
     ) -> GenericIndex:
-        return super()._copy_type_metadata(other)
+        return super()._copy_type_metadata(
+            other, override_dtypes=override_dtypes
+        )
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 9bda475589a..741aa62d1a0 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -30,7 +30,12 @@
 
 import cudf
 import cudf._lib as libcudf
-from cudf._typing import ColumnLike, DataFrameOrSeries, NotImplementedType
+from cudf._typing import (
+    ColumnLike,
+    DataFrameOrSeries,
+    Dtype,
+    NotImplementedType,
+)
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     is_bool_dtype,
@@ -45,6 +50,7 @@
 from cudf.core._base_index import BaseIndex
 from cudf.core.column import ColumnBase, as_column, full
 from cudf.core.column_accessor import ColumnAccessor
+from cudf.core.dtypes import ListDtype
 from cudf.core.frame import Frame
 from cudf.core.groupby.groupby import GroupBy
 from cudf.core.index import Index, RangeIndex, _index_from_columns
@@ -327,18 +333,28 @@ def _from_columns_like_self(
         columns: List[ColumnBase],
         column_names: Optional[abc.Iterable[str]] = None,
         index_names: Optional[List[str]] = None,
+        *,
+        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
     ):
         """Construct a `Frame` from a list of columns with metadata from self.
 
         If `index_names` is set, the first `len(index_names)` columns are
         used to construct the index of the frame.
+
+        If override_dtypes is provided then any non-None entry will be
+        used for the dtype of the matching column in preference to the
+        dtype of the column in self.
         """
         if column_names is None:
             column_names = self._column_names
         frame = self.__class__._from_columns(
             columns, column_names, index_names
         )
-        return frame._copy_type_metadata(self, include_index=bool(index_names))
+        return frame._copy_type_metadata(
+            self,
+            include_index=bool(index_names),
+            override_dtypes=override_dtypes,
+        )
 
     def _mimic_inplace(
         self: T, result: T, inplace: bool = False
@@ -899,40 +915,44 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
         return self._mimic_inplace(output, inplace=inplace)
 
     def _copy_type_metadata(
-        self: T, other: T, include_index: bool = True
+        self: T,
+        other: T,
+        include_index: bool = True,
+        *,
+        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
     ) -> T:
         """
         Copy type metadata from each column of `other` to the corresponding
         column of `self`.
         See `ColumnBase._with_type_metadata` for more information.
         """
-        super()._copy_type_metadata(other)
-
-        if include_index:
-            if self._index is not None and other._index is not None:
-                self._index._copy_type_metadata(other._index)
-                # When other._index is a CategoricalIndex, the current index
-                # will be a NumericalIndex with an underlying CategoricalColumn
-                # (the above _copy_type_metadata call will have converted the
-                # column). Calling cudf.Index on that column generates the
-                # appropriate index.
-                if isinstance(
-                    other._index, cudf.core.index.CategoricalIndex
-                ) and not isinstance(
-                    self._index, cudf.core.index.CategoricalIndex
-                ):
-                    self._index = cudf.Index(
-                        cast(
-                            cudf.core.index.NumericIndex, self._index
-                        )._column,
-                        name=self._index.name,
-                    )
-                elif isinstance(
-                    other._index, cudf.MultiIndex
-                ) and not isinstance(self._index, cudf.MultiIndex):
-                    self._index = cudf.MultiIndex._from_data(
-                        self._index._data, name=self._index.name
-                    )
+        super()._copy_type_metadata(other, override_dtypes=override_dtypes)
+        if (
+            include_index
+            and self._index is not None
+            and other._index is not None
+        ):
+            self._index._copy_type_metadata(other._index)
+            # When other._index is a CategoricalIndex, the current index
+            # will be a NumericalIndex with an underlying CategoricalColumn
+            # (the above _copy_type_metadata call will have converted the
+            # column). Calling cudf.Index on that column generates the
+            # appropriate index.
+            if isinstance(
+                other._index, cudf.core.index.CategoricalIndex
+            ) and not isinstance(
+                self._index, cudf.core.index.CategoricalIndex
+            ):
+                self._index = cudf.Index(
+                    cast(cudf.core.index.NumericIndex, self._index)._column,
+                    name=self._index.name,
+                )
+            elif isinstance(other._index, cudf.MultiIndex) and not isinstance(
+                self._index, cudf.MultiIndex
+            ):
+                self._index = cudf.MultiIndex._from_data(
+                    self._index._data, name=self._index.name
+                )
         return self
 
     @_cudf_nvtx_annotate
@@ -3476,22 +3496,32 @@ def _explode(self, explode_column: Any, ignore_index: bool):
             idx = None if ignore_index else self._index.copy(deep=True)
             return self.__class__._from_data(data, index=idx)
 
-        explode_column_num = self._column_names.index(explode_column)
+        column_index = self._column_names.index(explode_column)
         if not ignore_index and self._index is not None:
-            explode_column_num += self._index.nlevels
+            index_offset = self._index.nlevels
+        else:
+            index_offset = 0
 
         exploded = libcudf.lists.explode_outer(
             [
                 *(self._index._data.columns if not ignore_index else ()),
                 *self._columns,
             ],
-            explode_column_num,
+            column_index + index_offset,
         )
-
+        # We must copy inner datatype of the exploded list column to
+        # maintain struct dtype key names
+        exploded_dtype = cast(
+            ListDtype, self._columns[column_index].dtype
+        ).element_type
         return self._from_columns_like_self(
             exploded,
             self._column_names,
             self._index_names if not ignore_index else None,
+            override_dtypes=(
+                exploded_dtype if i == column_index else None
+                for i in range(len(self._columns))
+            ),
         )
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 06a2cc33c1f..f53daec4e0f 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1854,7 +1854,11 @@ def _intersection(self, other, sort=None):
         return midx
 
     @_cudf_nvtx_annotate
-    def _copy_type_metadata(self: MultiIndex, other: MultiIndex) -> MultiIndex:
-        res = super()._copy_type_metadata(other)
+    def _copy_type_metadata(
+        self: MultiIndex, other: MultiIndex, *, override_dtypes=None
+    ) -> MultiIndex:
+        res = super()._copy_type_metadata(
+            other, override_dtypes=override_dtypes
+        )
         res._names = other._names
         return res
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index a321d2b430a..2583f7dc95b 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -107,6 +107,51 @@ def test_listdtype_hash():
     assert hash(a) != hash(c)
 
 
+@pytest.fixture(params=["int", "float", "datetime", "timedelta"])
+def leaf_value(request):
+    if request.param == "int":
+        return np.int32(1)
+    elif request.param == "float":
+        return np.float64(1)
+    elif request.param == "datetime":
+        return pd.to_datetime("1900-01-01")
+    elif request.param == "timedelta":
+        return pd.to_timedelta("10d")
+    else:
+        raise ValueError("Unhandled data type")
+
+
+@pytest.fixture(params=["list", "struct"])
+def list_or_struct(request, leaf_value):
+    if request.param == "list":
+        return [[leaf_value], [leaf_value]]
+    elif request.param == "struct":
+        return {"a": leaf_value, "b": [leaf_value], "c": {"d": [leaf_value]}}
+    else:
+        raise ValueError("Unhandled data type")
+
+
+@pytest.fixture(params=["list", "struct"])
+def nested_list(request, list_or_struct, leaf_value):
+    if request.param == "list":
+        return [list_or_struct, list_or_struct]
+    elif request.param == "struct":
+        return [
+            {
+                "a": list_or_struct,
+                "b": leaf_value,
+                "c": {"d": list_or_struct, "e": leaf_value},
+            }
+        ]
+    else:
+        raise ValueError("Unhandled data type")
+
+
+def test_list_dtype_explode(nested_list):
+    sr = cudf.Series([nested_list])
+    assert sr.dtype.element_type == sr.explode().dtype
+
+
 @pytest.mark.parametrize(
     "data",
     [