From 3a92f617e71b81cc8b3b098631c14dbc4fe56c75 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 12 Sep 2022 15:58:29 +0100 Subject: [PATCH 1/8] Transfer correct dtype to exploded column Since libcudf doesn't keep track of StructDtype key names, round-tripping through outer_explode loses information. We know the correct dtype, since it is the element_type of the exploded list column, so use attach that type metadata before handing back the return value. --- python/cudf/cudf/core/indexed_frame.py | 11 ++++++++++- python/cudf/cudf/tests/test_list.py | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9bda475589a..a464a55dd0f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -45,6 +45,7 @@ from cudf.core._base_index import BaseIndex from cudf.core.column import ColumnBase, as_column, full from cudf.core.column_accessor import ColumnAccessor +from cudf.core.dtypes import ListDtype from cudf.core.frame import Frame from cudf.core.groupby.groupby import GroupBy from cudf.core.index import Index, RangeIndex, _index_from_columns @@ -3487,7 +3488,15 @@ def _explode(self, explode_column: Any, ignore_index: bool): ], explode_column_num, ) - + # dtype of exploded column is the element dtype of the list + # column that was exploded, this needs to be copied over so + # that nested struct dtypes maintain the key names. + element_type = cast( + ListDtype, self._data[explode_column].dtype + ).element_type + exploded[explode_column_num] = exploded[ + explode_column_num + ]._with_type_metadata(element_type) return self._from_columns_like_self( exploded, self._column_names, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index a321d2b430a..204b879e4a3 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -2,11 +2,13 @@ import functools import operator +from string import printable import numpy as np import pandas as pd import pyarrow as pa import pytest +from hypothesis import given, settings, strategies as st import cudf from cudf import NA @@ -107,6 +109,27 @@ def test_listdtype_hash(): assert hash(a) != hash(c) +@settings(deadline=None) +@given( + st.builds( + lambda x: [x], + st.recursive( + st.integers(min_value=-100, max_value=100), + lambda c: st.lists(c, min_size=1, max_size=1) + | st.dictionaries( + st.text(printable, min_size=1, max_size=20), + c, + min_size=1, + max_size=10, + ), + ), + ) +) +def test_list_dtype_explode(x): + sr = cudf.Series([x]) + assert sr.dtype.element_type == sr.explode().dtype + + @pytest.mark.parametrize( "data", [ From a15731dbc72b65c8019403574827653369b99b21 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 13 Sep 2022 10:17:17 +0100 Subject: [PATCH 2/8] No hypothesis this time --- python/cudf/cudf/tests/test_list.py | 62 +++++++++++++++++++---------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 204b879e4a3..2583f7dc95b 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -2,13 +2,11 @@ import functools import operator -from string import printable import numpy as np import pandas as pd import pyarrow as pa import pytest -from hypothesis import given, settings, strategies as st import cudf from cudf import NA @@ -109,24 +107,48 @@ def test_listdtype_hash(): assert hash(a) != hash(c) -@settings(deadline=None) -@given( - st.builds( - lambda x: [x], - st.recursive( - st.integers(min_value=-100, max_value=100), - lambda c: st.lists(c, min_size=1, max_size=1) - | st.dictionaries( - st.text(printable, min_size=1, max_size=20), - c, - min_size=1, - max_size=10, - ), - ), - ) -) -def test_list_dtype_explode(x): - sr = cudf.Series([x]) +@pytest.fixture(params=["int", "float", "datetime", "timedelta"]) +def leaf_value(request): + if request.param == "int": + return np.int32(1) + elif request.param == "float": + return np.float64(1) + elif request.param == "datetime": + return pd.to_datetime("1900-01-01") + elif request.param == "timedelta": + return pd.to_timedelta("10d") + else: + raise ValueError("Unhandled data type") + + +@pytest.fixture(params=["list", "struct"]) +def list_or_struct(request, leaf_value): + if request.param == "list": + return [[leaf_value], [leaf_value]] + elif request.param == "struct": + return {"a": leaf_value, "b": [leaf_value], "c": {"d": [leaf_value]}} + else: + raise ValueError("Unhandled data type") + + +@pytest.fixture(params=["list", "struct"]) +def nested_list(request, list_or_struct, leaf_value): + if request.param == "list": + return [list_or_struct, list_or_struct] + elif request.param == "struct": + return [ + { + "a": list_or_struct, + "b": leaf_value, + "c": {"d": list_or_struct, "e": leaf_value}, + } + ] + else: + raise ValueError("Unhandled data type") + + +def test_list_dtype_explode(nested_list): + sr = cudf.Series([nested_list]) assert sr.dtype.element_type == sr.explode().dtype From 08a2aa78078ddd8c8b3a096a128b71f6719824d8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 13 Sep 2022 11:35:07 +0100 Subject: [PATCH 3/8] Factor out index metadata transfer into separate method --- python/cudf/cudf/core/indexed_frame.py | 57 +++++++++++++++----------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a464a55dd0f..cb64cf21e5a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -908,32 +908,39 @@ def _copy_type_metadata( See `ColumnBase._with_type_metadata` for more information. """ super()._copy_type_metadata(other) - if include_index: - if self._index is not None and other._index is not None: - self._index._copy_type_metadata(other._index) - # When other._index is a CategoricalIndex, the current index - # will be a NumericalIndex with an underlying CategoricalColumn - # (the above _copy_type_metadata call will have converted the - # column). Calling cudf.Index on that column generates the - # appropriate index. - if isinstance( - other._index, cudf.core.index.CategoricalIndex - ) and not isinstance( - self._index, cudf.core.index.CategoricalIndex - ): - self._index = cudf.Index( - cast( - cudf.core.index.NumericIndex, self._index - )._column, - name=self._index.name, - ) - elif isinstance( - other._index, cudf.MultiIndex - ) and not isinstance(self._index, cudf.MultiIndex): - self._index = cudf.MultiIndex._from_data( - self._index._data, name=self._index.name - ) + self._copy_index_metadata(other) + return self + + def _copy_index_metadata(self: T, other: T) -> T: + """ + Copy type metadata from the index of `other` to the index or + `self`. + + See also :func:`_copy_type_metadata`. + """ + if self._index is not None and other._index is not None: + self._index._copy_type_metadata(other._index) + # When other._index is a CategoricalIndex, the current index + # will be a NumericalIndex with an underlying CategoricalColumn + # (the above _copy_type_metadata call will have converted the + # column). Calling cudf.Index on that column generates the + # appropriate index. + if isinstance( + other._index, cudf.core.index.CategoricalIndex + ) and not isinstance( + self._index, cudf.core.index.CategoricalIndex + ): + self._index = cudf.Index( + cast(cudf.core.index.NumericIndex, self._index)._column, + name=self._index.name, + ) + elif isinstance(other._index, cudf.MultiIndex) and not isinstance( + self._index, cudf.MultiIndex + ): + self._index = cudf.MultiIndex._from_data( + self._index._data, name=self._index.name + ) return self @_cudf_nvtx_annotate From b86fdaf1941bd3761cecde4a7e9e33c21df5cc72 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 13 Sep 2022 11:37:22 +0100 Subject: [PATCH 4/8] Sui generis copy of type metadata in _explode Partially replicates the implementation of _from_columns_like_self, but makes it explicit where the dtype is cominng from for each column. --- python/cudf/cudf/core/indexed_frame.py | 41 ++++++++++++++++++-------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index cb64cf21e5a..948e985310f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3484,31 +3484,48 @@ def _explode(self, explode_column: Any, ignore_index: bool): idx = None if ignore_index else self._index.copy(deep=True) return self.__class__._from_data(data, index=idx) - explode_column_num = self._column_names.index(explode_column) + column_index = self._column_names.index(explode_column) if not ignore_index and self._index is not None: - explode_column_num += self._index.nlevels + index_offset = self._index.nlevels + else: + index_offset = 0 exploded = libcudf.lists.explode_outer( [ *(self._index._data.columns if not ignore_index else ()), *self._columns, ], - explode_column_num, + column_index + index_offset, ) - # dtype of exploded column is the element dtype of the list - # column that was exploded, this needs to be copied over so - # that nested struct dtypes maintain the key names. - element_type = cast( - ListDtype, self._data[explode_column].dtype + # Sui generis implementation of _from_columns_like_self that + # copies type metadata from self for all columns except the + # exploded column (for which we use the inner dtype of the + # exploded list column). This latter dtype needs to be copied + # over so that nested struct dtypes maintain the key names. + # Should this pattern become more common elsewhere, it would + # make sense to implement something like + # _from_columns_with_dtypes uniformly. + exploded_dtype = cast( + ListDtype, self._columns[column_index].dtype ).element_type - exploded[explode_column_num] = exploded[ - explode_column_num - ]._with_type_metadata(element_type) - return self._from_columns_like_self( + frame = self.__class__._from_columns( exploded, self._column_names, self._index_names if not ignore_index else None, ) + for i, (name, old_column, new_column) in enumerate( + zip(self._column_names, self._columns, exploded[index_offset:]) + ): + frame._data.set_by_label( + name, + new_column._with_type_metadata( + exploded_dtype if i == column_index else old_column.dtype + ), + validate=False, + ) + if not ignore_index: + frame._copy_index_metadata(self) + return frame @_cudf_nvtx_annotate def tile(self, count): From c1bb83eb9588af9f3ff9b1a8235082af77471bb2 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 13 Sep 2022 15:00:22 +0100 Subject: [PATCH 5/8] Back out by-hand type construction in _explode Instead allow caller of _from_columns_like_self to provide an iterable of dtypes that should override the dtypes picked up from the columns of self. --- python/cudf/cudf/core/dataframe.py | 9 +++- python/cudf/cudf/core/frame.py | 30 +++++++++-- python/cudf/cudf/core/indexed_frame.py | 73 ++++++++++++-------------- 3 files changed, 67 insertions(+), 45 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c07a88e9396..15bef5cad00 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -39,7 +39,7 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf._typing import ColumnLike, NotImplementedType +from cudf._typing import ColumnLike, Dtype, NotImplementedType from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -6536,9 +6536,14 @@ def _from_columns_like_self( columns: List[ColumnBase], column_names: abc.Iterable[str], index_names: Optional[List[str]] = None, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ) -> DataFrame: result = super()._from_columns_like_self( - columns, column_names, index_names + columns, + column_names, + index_names, + override_dtypes=override_dtypes, ) result._set_column_names_like(self) return result diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ec78a8a37cf..1aefdfb1c2e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,6 +3,7 @@ from __future__ import annotations import copy +import itertools import operator import pickle import warnings @@ -131,6 +132,8 @@ def _from_columns_like_self( self, columns: List[ColumnBase], column_names: Optional[abc.Iterable[str]] = None, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ): """Construct a Frame from a list of columns with metadata from self. @@ -139,7 +142,7 @@ def _from_columns_like_self( if column_names is None: column_names = self._column_names frame = self.__class__._from_columns(columns, column_names) - return frame._copy_type_metadata(self) + return frame._copy_type_metadata(self, override_dtypes=override_dtypes) def _mimic_inplace( self: T, result: T, inplace: bool = False @@ -1160,17 +1163,34 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] - def _copy_type_metadata(self: T, other: T) -> T: + def _copy_type_metadata( + self: T, + other: T, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + ) -> T: """ Copy type metadata from each column of `other` to the corresponding column of `self`. + + If override_dtypes is provided, any non-None entry + will be used in preference to the relevant column of other to + provide the new dtype. + See `ColumnBase._with_type_metadata` for more information. """ - for name, col, other_col in zip( - self._data.keys(), self._data.values(), other._data.values() + for (name, col), dtype in zip( + self._data.items(), + ( + dtype or col.dtype + for (dtype, col) in zip( + override_dtypes or itertools.repeat(None), + other._data.values(), + ) + ), ): self._data.set_by_label( - name, col._with_type_metadata(other_col.dtype), validate=False + name, col._with_type_metadata(dtype), validate=False ) return self diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 948e985310f..b2b2e81d5ee 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -30,7 +30,12 @@ import cudf import cudf._lib as libcudf -from cudf._typing import ColumnLike, DataFrameOrSeries, NotImplementedType +from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + Dtype, + NotImplementedType, +) from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, @@ -328,18 +333,28 @@ def _from_columns_like_self( columns: List[ColumnBase], column_names: Optional[abc.Iterable[str]] = None, index_names: Optional[List[str]] = None, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ): """Construct a `Frame` from a list of columns with metadata from self. If `index_names` is set, the first `len(index_names)` columns are used to construct the index of the frame. + + If override_dtypes is provided then any non-None entry will be + used for the dtype of the matching column in preference to the + dtype of the column in self. """ if column_names is None: column_names = self._column_names frame = self.__class__._from_columns( columns, column_names, index_names ) - return frame._copy_type_metadata(self, include_index=bool(index_names)) + return frame._copy_type_metadata( + self, + include_index=bool(index_names), + override_dtypes=override_dtypes, + ) def _mimic_inplace( self: T, result: T, inplace: bool = False @@ -900,26 +915,23 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) def _copy_type_metadata( - self: T, other: T, include_index: bool = True + self: T, + other: T, + include_index: bool = True, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ) -> T: """ Copy type metadata from each column of `other` to the corresponding column of `self`. See `ColumnBase._with_type_metadata` for more information. """ - super()._copy_type_metadata(other) - if include_index: - self._copy_index_metadata(other) - return self - - def _copy_index_metadata(self: T, other: T) -> T: - """ - Copy type metadata from the index of `other` to the index or - `self`. - - See also :func:`_copy_type_metadata`. - """ - if self._index is not None and other._index is not None: + super()._copy_type_metadata(other, override_dtypes=override_dtypes) + if ( + include_index + and self._index is not None + and other._index is not None + ): self._index._copy_type_metadata(other._index) # When other._index is a CategoricalIndex, the current index # will be a NumericalIndex with an underlying CategoricalColumn @@ -3497,35 +3509,20 @@ def _explode(self, explode_column: Any, ignore_index: bool): ], column_index + index_offset, ) - # Sui generis implementation of _from_columns_like_self that - # copies type metadata from self for all columns except the - # exploded column (for which we use the inner dtype of the - # exploded list column). This latter dtype needs to be copied - # over so that nested struct dtypes maintain the key names. - # Should this pattern become more common elsewhere, it would - # make sense to implement something like - # _from_columns_with_dtypes uniformly. + # We must copy inner datatype of the exploded list column to + # maintain struct dtype key names exploded_dtype = cast( ListDtype, self._columns[column_index].dtype ).element_type - frame = self.__class__._from_columns( + return self._from_columns_like_self( exploded, self._column_names, self._index_names if not ignore_index else None, + override_dtypes=( + exploded_dtype if i == column_index else None + for i, _ in enumerate(self._columns) + ), ) - for i, (name, old_column, new_column) in enumerate( - zip(self._column_names, self._columns, exploded[index_offset:]) - ): - frame._data.set_by_label( - name, - new_column._with_type_metadata( - exploded_dtype if i == column_index else old_column.dtype - ), - validate=False, - ) - if not ignore_index: - frame._copy_index_metadata(self) - return frame @_cudf_nvtx_annotate def tile(self, count): From 9d721779e382cac338bcb5f6864b58337b8d676e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 13 Sep 2022 17:27:02 +0100 Subject: [PATCH 6/8] More new keyword arguments --- python/cudf/cudf/core/_base_index.py | 4 +++- python/cudf/cudf/core/index.py | 10 +++++++--- python/cudf/cudf/core/multiindex.py | 4 +++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 549b8bae12a..6898ae4941c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -101,7 +101,9 @@ def __getitem__(self, key): def __contains__(self, item): return item in self._values - def _copy_type_metadata(self: BaseIndexT, other: BaseIndexT) -> BaseIndexT: + def _copy_type_metadata( + self: BaseIndexT, other: BaseIndexT, *, override_dtypes=None + ) -> BaseIndexT: raise NotImplementedError def get_level_values(self, level): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d1995615e0c..57a10358561 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -184,7 +184,9 @@ def __init__( # whereas _stop is an upper bound. self._end = self._start + self._step * (len(self._range) - 1) - def _copy_type_metadata(self: RangeIndex, other: RangeIndex) -> RangeIndex: + def _copy_type_metadata( + self: RangeIndex, other: RangeIndex, *, override_dtypes=None + ) -> RangeIndex: # There is no metadata to be copied for RangeIndex since it does not # have an underlying column. return self @@ -978,9 +980,11 @@ def _binaryop( # Override just to make mypy happy. @_cudf_nvtx_annotate def _copy_type_metadata( - self: GenericIndex, other: GenericIndex + self: GenericIndex, other: GenericIndex, *, override_dtypes=None ) -> GenericIndex: - return super()._copy_type_metadata(other) + return super()._copy_type_metadata( + other, override_dtypes=override_dtypes + ) @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 06a2cc33c1f..f53daec4e0f 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1854,7 +1854,9 @@ def _intersection(self, other, sort=None): return midx @_cudf_nvtx_annotate - def _copy_type_metadata(self: MultiIndex, other: MultiIndex) -> MultiIndex: + def _copy_type_metadata( + self: MultiIndex, other: MultiIndex, *, override_dtypes=None + ) -> MultiIndex: res = super()._copy_type_metadata(other) res._names = other._names return res From 3e409e2eb42d8838daa4920cf60cbf77cfc98a83 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 14 Sep 2022 09:31:35 +0100 Subject: [PATCH 7/8] No need for enumerate --- python/cudf/cudf/core/indexed_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index b2b2e81d5ee..741aa62d1a0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3520,7 +3520,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): self._index_names if not ignore_index else None, override_dtypes=( exploded_dtype if i == column_index else None - for i, _ in enumerate(self._columns) + for i in range(len(self._columns)) ), ) From 2e06a633ce3615fe03d457a08508a062da608956 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 14 Sep 2022 09:32:51 +0100 Subject: [PATCH 8/8] Replace incomprehensible Haskell with imperative code --- python/cudf/cudf/core/frame.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1aefdfb1c2e..5183090c0df 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1179,16 +1179,13 @@ def _copy_type_metadata( See `ColumnBase._with_type_metadata` for more information. """ - for (name, col), dtype in zip( - self._data.items(), - ( - dtype or col.dtype - for (dtype, col) in zip( - override_dtypes or itertools.repeat(None), - other._data.values(), - ) - ), - ): + if override_dtypes is None: + override_dtypes = itertools.repeat(None) + dtypes = ( + dtype if dtype is not None else col.dtype + for (dtype, col) in zip(override_dtypes, other._data.values()) + ) + for (name, col), dtype in zip(self._data.items(), dtypes): self._data.set_by_label( name, col._with_type_metadata(dtype), validate=False )