From 4cb60e6b586338c468e04a4274a05c06811adeb7 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jul 2020 20:19:15 +0200 Subject: [PATCH 1/2] Implement BaseDtypeTests for ArrowStringDtype --- pandas/core/arrays/base.py | 6 +- pandas/core/arrays/string_arrow.py | 484 ++++++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 125 +++++ setup.py | 2 +- 4 files changed, 615 insertions(+), 2 deletions(-) create mode 100644 pandas/core/arrays/string_arrow.py create mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8193d65b3b30c..736d95b4b64b6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -457,9 +457,13 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + # FIXME: Really hard-code here? 
+ if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..8248a3e91c0fe --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,484 @@ +from collections.abc import Iterable +from typing import Any, Optional, Sequence, Tuple, Type, Union + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc + +from pandas._libs import missing as libmissing +from pandas._typing import ArrayLike + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype + +import pandas as pd +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import check_array_indexer + + +def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: + scalar = arrow_scalar.as_py() + if scalar is None: + return libmissing.NA + else: + return scalar + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.ArrowStringDtype() + ArrowStringDtype + """ + + name = "arrow_string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type[str]: + return str + + @classmethod + def construct_array_type(cls) -> Type["ArrowStringArray"]: + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + return ArrowStringArray + + def __hash__(self) -> int: + return hash("ArrowStringDtype") + + def __repr__(self) -> str: + return "ArrowStringDtype" + + def __from_arrow__( + self, array: Union["pa.Array", "pa.ChunkedArray"] + ) -> "ArrowStringArray": + """ + Construct StringArray from pyarrow Array/ChunkedArray. + """ + return ArrowStringArray(array) + + def __eq__(self, other) -> bool: + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, ArrowStringDtype): + return True + elif isinstance(other, str) and other == "arrow_string": + return True + else: + return False + + +class ArrowStringArray(ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. 
+ + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + def __init__(self, values): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self.data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + # TODO(ARROW-9407): Accept pd.NA in Arrow + scalars_corrected = [None if pd.isna(x) else x for x in scalars] + return cls(pa.array(scalars_corrected, type=pa.string())) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return ArrowStringDtype() + + def __array__(self, *args, **kwargs) -> "np.ndarray": + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.data.__array__(*args, **kwargs) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self.data + + @property + def size(self) -> int: + """ + Return the number of elements in this array. + + Returns + ------- + size : int + """ + return len(self.data) + + @property + def shape(self) -> Tuple[int]: + """Return the shape of the data.""" + # This may be patched by pandas to support pseudo-2D operations. + return (len(self.data),) + + @property + def ndim(self) -> int: + """Return the number of dimensions of the underlying data.""" + return 1 + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self.data) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. 
+ + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, Iterable): + if not is_array_like(item): + item = np.array(item) + if len(item) == 0: + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item): + return self.take(item) + elif is_bool_dtype(item): + return type(self)(self.data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif is_integer(item): + if item < 0: + item += len(self) + if item >= len(self): + raise IndexError("index out of bounds") + + value = self.data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return _as_pandas_scalar(value) + + def fillna(self, value=None, method=None, limit=None): + raise NotImplementedError("fillna") + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self.data.nbytes + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. 
+ """ + # TODO: Implement .to_numpy for ChunkedArray + return self.data.is_null().to_pandas().values + + def copy(self) -> ExtensionArray: + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + return type(self)(self.data) + + def __eq__(self, other: Any) -> ArrayLike: + """ + Return for `self == other` (element-wise equality). + """ + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + return NotImplemented + if isinstance(other, ArrowStringArray): + result = pc.equal(self.data, other.data) + elif is_scalar(other): + result = pc.equal(self.data, pa.scalar(other)) + else: + raise NotImplementedError("Neither scalar nor ArrowStringArray") + + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return pd.array(result.to_pandas().values) + + def __setitem__(self, key, value): + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not pd.api.types.is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self.data[0:key].chunks, + pa.array([value], type=pa.string()), + *self.data[(key + 1) :].chunks, + ] + self.data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. 
+ # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) + + if pd.api.types.is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. 
+ ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if len(self.data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.max() >= len(self.data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + if (indices_array < 0).any(): + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=indices_array < 0) + result = self.data.take(indices_array) + if pd.isna(fill_value): + return type(self)(result) + return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self.data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. 
+ if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self.data) + return type(self)(self.data.take(indices_array)) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py new file mode 100644 index 0000000000000..437d51060fb7f --- /dev/null +++ b/pandas/tests/extension/test_string_arrow.py @@ -0,0 +1,125 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return ArrowStringDtype() + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return ArrowStringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return ArrowStringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return ArrowStringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + @pytest.mark.xfail(reason="Fails until implement, remove before merge") + def test_view(self, data): + base.BaseInterfaceTests.test_view(self, data) + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +# class TestReshaping(base.BaseReshapingTests): +# pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +# class 
TestMissing(base.BaseMissingTests): +# pass + + +# class TestNoReduce(base.BaseNoReduceTests): +# @pytest.mark.parametrize("skipna", [True, False]) +# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): +# op_name = all_numeric_reductions +# +# if op_name in ["min", "max"]: +# return None +# +# s = pd.Series(data) +# with pytest.raises(TypeError): +# getattr(s, op_name)(skipna=skipna) + + +# class TestMethods(base.BaseMethodsTests): +# @pytest.mark.skip(reason="returns nullable") +# def test_value_counts(self, all_data, dropna): +# return super().test_value_counts(all_data, dropna) + + +# class TestCasting(base.BaseCastingTests): +# pass + + +# class TestComparisonOps(base.BaseComparisonOpsTests): +# def _compare_other(self, s, data, op_name, other): +# result = getattr(s, op_name)(other) +# expected = getattr(s.astype(object), op_name)(other).astype("boolean") +# self.assert_series_equal(result, expected) + +# def test_compare_scalar(self, data, all_compare_operators): +# op_name = all_compare_operators +# s = pd.Series(data) +# self._compare_other(s, data, op_name, "abc") + + +# class TestParsing(base.BaseParsingTests): +# pass + + +# class TestPrinting(base.BasePrintingTests): +# pass + + +# class TestGroupBy(base.BaseGroupbyTests): +# pass diff --git a/setup.py b/setup.py index f6f0cd9aabc0e..4033ea2935de5 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = ["-Werror"] + extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From d242f2d0bc2d0eae9481ce2fa09969d9eb20113c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 3 Sep 2020 15:32:45 -0500 Subject: [PATCH 2/2] Refactor to use parametrized StringDtype --- pandas/core/arrays/base.py | 13 +- pandas/core/arrays/string_.py | 90 +++++++++- pandas/core/arrays/string_arrow.py | 166 +++++++----------- 
pandas/core/config_init.py | 13 ++ pandas/core/strings.py | 10 +- .../tests/arrays/string_/test_string_arrow.py | 26 +++ pandas/tests/extension/arrow/test_string.py | 7 +- pandas/tests/extension/test_string_arrow.py | 103 +++++++---- setup.py | 2 +- 9 files changed, 261 insertions(+), 169 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 736d95b4b64b6..9b1b2c0d74e3f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -356,6 +356,8 @@ def __ne__(self, other: Any) -> ArrayLike: """ Return for `self != other` (element-wise in-equality). """ + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented return ~(self == other) def to_numpy( @@ -457,13 +459,10 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) # FIXME: Really hard-code here? - if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -928,9 +927,9 @@ def take( from the right (the default). This is similar to :func:`numpy.take`. - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + * True: ``-1`` in `indices` indicate missing values. + These values are set to `fill_value`. Any other other negative + value raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 381968f9724b6..0e7c5a8036bcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,8 +1,10 @@ import operator -from typing import TYPE_CHECKING, Type, Union +from typing import TYPE_CHECKING, Any, Type, Union import numpy as np +from pandas._config import get_option + from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype @@ -50,17 +52,83 @@ class StringDtype(ExtensionDtype): StringDtype """ - name = "string" - #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + _metadata = ("storage",) + + def __init__(self, storage=None): + if storage is None: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + self.storage = storage + + @property + def name(self): + return f"StringDtype[{self.storage}]" @property def type(self) -> Type[str]: return str @classmethod - def construct_array_type(cls) -> Type["StringArray"]: + def construct_from_string(cls, string): + """ + Construct a StringDtype from a string. + + Parameters + ---------- + string : str + The type of the name. The storage type will be taking from `string`. + Valid options and their storage types are + + ========================== ============== + string result storage + ========================== ============== + ``'string'`` global default + ``'string[python]'`` python + ``'StringDtype[python]'`` python + ``'string[pyarrow]'`` pyarrow + ``'StringDtype[pyarrow]'`` pyarrow + ========================== ============= + + Returns + ------- + StringDtype + + Raise + ----- + TypeError + If the string is not a valid option. 
+ + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == "string": + # TODO: use global default + return cls() + elif string in {"string[python]", "StringDtype[python]"}: + return cls(storage="python") + elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}: + return cls(storage="pyarrow") + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str) and other == "string": + return True + return super().__eq__(other) + + def __hash__(self) -> int: + # custom __eq__ so have to override __hash__ + return super().__hash__() + + # XXX: this is a classmethod, but we need to know the storage type. + def construct_array_type(self) -> Type["StringArray"]: """ Return the array type associated with this dtype. @@ -68,10 +136,15 @@ def construct_array_type(cls) -> Type["StringArray"]: ------- type """ - return StringArray + from .string_arrow import ArrowStringArray + + if self.storage == "python": + return StringArray + else: + return ArrowStringArray - def __repr__(self) -> str: - return "StringDtype" + def __repr__(self): + return self.name def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] @@ -80,6 +153,7 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" import pyarrow # noqa: F811 + from .string_arrow import ArrowStringArray if isinstance(array, pyarrow.Array): chunks = [array] @@ -93,7 +167,7 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + return ArrowStringArray._concat_same_type(results) class StringArray(PandasArray): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8248a3e91c0fe..c0831a65b3644 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,5 @@ from collections.abc import Iterable -from typing import Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Union import numpy as np import pyarrow as pa @@ -8,18 +8,19 @@ from pandas._libs import missing as libmissing from pandas._typing import ArrayLike -from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna -import pandas as pd from pandas.api.types import ( is_array_like, is_bool_dtype, + is_int64_dtype, is_integer, is_integer_dtype, is_scalar, ) +from pandas.core.algorithms import factorize from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import check_array_indexer @@ -31,89 +32,6 @@ def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: return scalar -@register_extension_dtype -class ArrowStringDtype(ExtensionDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.1.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. 
- - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> pd.ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - @property - def type(self) -> Type[str]: - return str - - @classmethod - def construct_array_type(cls) -> Type["ArrowStringArray"]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( - self, array: Union["pa.Array", "pa.ChunkedArray"] - ) -> "ArrowStringArray": - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -165,19 +83,20 @@ def __init__(self, values): self.data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + self._dtype = StringDtype(storage="pyarrow") @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO(ARROW-9407): Accept pd.NA in Arrow - scalars_corrected = [None if pd.isna(x) else x for x in scalars] + scalars_corrected = [None if isna(x) else x for x in scalars] return cls(pa.array(scalars_corrected, type=pa.string())) @property - def dtype(self) -> ArrowStringDtype: + def dtype(self) -> StringDtype: """ - An instance of 'ArrowStringDtype'. 
+ An instance of 'StringDtype'. """ - return ArrowStringDtype() + return self._dtype def __array__(self, *args, **kwargs) -> "np.ndarray": """Correctly construct numpy arrays when passed to `np.asarray()`.""" @@ -276,15 +195,6 @@ def __getitem__(self, item): else: return _as_pandas_scalar(value) - def fillna(self, value=None, method=None, limit=None): - raise NotImplementedError("fillna") - - def _reduce(self, name, skipna=True, **kwargs): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna) - - raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - @property def nbytes(self) -> int: """ @@ -320,7 +230,9 @@ def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). """ - if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + from pandas import array, Series, DataFrame, Index + + if isinstance(other, (Series, DataFrame, Index)): return NotImplemented if isinstance(other, ArrowStringArray): result = pc.equal(self.data, other.data) @@ -330,7 +242,7 @@ def __eq__(self, other: Any) -> ArrayLike: raise NotImplementedError("Neither scalar nor ArrowStringArray") # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return pd.array(result.to_pandas().values) + return array(result.to_pandas().values, dtype="boolean") def __setitem__(self, key, value): # type: (Union[int, np.ndarray], Any) -> None @@ -357,9 +269,9 @@ def __setitem__(self, key, value): key = check_array_indexer(self, key) if is_integer(key): - if not pd.api.types.is_scalar(value): + if not is_scalar(value): raise ValueError("Must pass scalars with scalar indexer") - elif pd.isna(value): + elif isna(value): value = None elif not isinstance(value, str): raise ValueError("Scalar must be NA or str") @@ -386,7 +298,7 @@ def __setitem__(self, key, value): # TODO(ARROW-9431): Directly support setitem(integers) key_array = np.asanyarray(key) - if pd.api.types.is_scalar(value): + if is_scalar(value): value = np.broadcast_to(value, 
len(key_array)) else: value = np.asarray(value) @@ -461,15 +373,20 @@ def take( if len(self.data) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.max() >= len(self.data): + if len(indices_array) > 0 and indices_array.max() >= len(self.data): raise IndexError("out of bounds value in 'indices'.") if allow_fill: if (indices_array < 0).any(): + if indices_array.min() < -1: + raise ValueError( + "'indicies' contains negative values other " + "-1 with 'allow_fill=True." + ) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) - if pd.isna(fill_value): + if isna(fill_value): return type(self)(result) return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: @@ -482,3 +399,38 @@ def take( indices_array = np.copy(indices_array) indices_array[indices_array < 0] += len(self.data) return type(self)(self.data.take(indices_array)) + + def value_counts(self, dropna=True): + from pandas import Series + + if dropna: + na = self.isna() + self = self[~na] + counts = self.data.value_counts() + return Series(counts.field(1), counts.field(0)) + + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + # see https://github.com/xhochy/fletcher/blob/master/fletcher/base.py + # doesn't handle dictionary types. 
+ if self.data.num_chunks == 1: + encoded = self.data.chunk(0).dictionary_encode() + indices = encoded.indices.to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(int) + if not is_int64_dtype(indices): + indices = indices.astype(np.int64) + return indices.values, type(self)(encoded.dictionary) + else: + np_array = self.data.to_pandas().values + return factorize(np_array, na_sentinel=na_sentinel) + + @classmethod + def _concat_same_type( + cls, to_concat: Sequence["ArrowStringArray"] + ) -> "ArrowStringArray": + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea.data.iterchunks()] + ) + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0c23f1b4bcdf2..a58e6eccf7644 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -504,6 +504,19 @@ def use_inf_as_na_cb(key): ) +string_storage_doc = """ +: string + The default storage for StringDtype. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6702bf519c52e..59aa8fc5cfa0e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -901,8 +901,10 @@ def _result_dtype(arr): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. 
- if arr.dtype.name == "string": - return "string" + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype.name, StringDtype): + return arr.dtype.name else: return object @@ -2097,9 +2099,11 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): + from pandas.core.arrays.string_ import StringDtype + self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" + self._is_string = isinstance(data.dtype, StringDtype) # ._values.categories works for both Series/Index self._parent = data._values.categories if self._is_categorical else data diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000..40e3f21670ea0 --- /dev/null +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,26 @@ +import pytest + +import pandas as pd +import pandas.testing as tm + + +def test_eq_all_na(): + a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_config(): + # python by default + assert pd.StringDtype().storage == "python" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "python" + + with pd.option_context("mode.string_storage", "pyarrow"): + assert pd.StringDtype().storage == "pyarrow" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "pyarrow" + + with pytest.raises(ValueError): + pd.options.mode.string_storage = "foo" diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index abd5c1f386dc5..f32f1e415ddc7 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -4,10 +4,9 @@ pytest.importorskip("pyarrow", minversion="0.13.0") -from .arrays import ArrowStringDtype # isort:skip - def 
test_constructor_from_list(): # GH 27673 - result = pd.Series(["E"], dtype=ArrowStringDtype()) - assert isinstance(result.dtype, ArrowStringDtype) + result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, pd.StringDtype) + assert result.dtype.storage == "pyarrow" diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 437d51060fb7f..848e8a435b530 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -4,13 +4,13 @@ import pytest import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.tests.extension import base @pytest.fixture def dtype(): - return ArrowStringDtype() + return pd.StringDtype(storage="pyarrow") @pytest.fixture @@ -62,64 +62,89 @@ class TestConstructors(base.BaseConstructorsTests): pass -# class TestReshaping(base.BaseReshapingTests): -# pass +class TestReshaping(base.BaseReshapingTests): + pass class TestGetitem(base.BaseGetitemTests): - pass + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function " + "fill_null has no kernel matching input types " + "(array[string], scalar[string])" + ) + def test_take_non_na_fill_value(self, data_missing): + super().test_take_non_na_fill_value(data_missing) + + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no " + "kernel matching input types (array[string], scalar[string])" + ) + def test_reindex_non_na_fill_value(self, data_missing): + super().test_reindex_non_na_fill_value(data_missing) class TestSetitem(base.BaseSetitemTests): + @pytest.mark.xfail(reason="TODO") + def test_setitem_preserves_views(self, data): + # Unclear where the issue is (pyarrow getitem, our getitem, our slice) + # and what to do here. 
+ super().test_setitem_preserves_views(data) + + +class TestMissing(base.BaseMissingTests): pass -# class TestMissing(base.BaseMissingTests): -# pass +class TestNoReduce(base.BaseNoReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + if op_name in ["min", "max"]: + return None -# class TestNoReduce(base.BaseNoReduceTests): -# @pytest.mark.parametrize("skipna", [True, False]) -# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): -# op_name = all_numeric_reductions -# -# if op_name in ["min", "max"]: -# return None -# -# s = pd.Series(data) -# with pytest.raises(TypeError): -# getattr(s, op_name)(skipna=skipna) + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) -# class TestMethods(base.BaseMethodsTests): -# @pytest.mark.skip(reason="returns nullable") -# def test_value_counts(self, all_data, dropna): -# return super().test_value_counts(all_data, dropna) +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) -# class TestCasting(base.BaseCastingTests): -# pass +class TestCasting(base.BaseCastingTests): + pass -# class TestComparisonOps(base.BaseComparisonOpsTests): -# def _compare_other(self, s, data, op_name, other): -# result = getattr(s, op_name)(other) -# expected = getattr(s.astype(object), op_name)(other).astype("boolean") -# self.assert_series_equal(result, expected) +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + if op_name not in {"__eq__", "__ne__"}: + pytest.skip(f"{op_name} is not implemented.") + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") + self.assert_series_equal(result, expected) -# def 
test_compare_scalar(self, data, all_compare_operators): -# op_name = all_compare_operators -# s = pd.Series(data) -# self._compare_other(s, data, op_name, "abc") + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + other = pd.Series([data[0]] * len(data), dtype=data.dtype) + self._compare_other(s, data, op_name, other) -# class TestParsing(base.BaseParsingTests): -# pass + +class TestParsing(base.BaseParsingTests): + pass -# class TestPrinting(base.BasePrintingTests): -# pass +class TestPrinting(base.BasePrintingTests): + pass -# class TestGroupBy(base.BaseGroupbyTests): -# pass +class TestGroupBy(base.BaseGroupbyTests): + pass diff --git a/setup.py b/setup.py index 4033ea2935de5..f6f0cd9aabc0e 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g")