diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8193d65b3b30c..9b1b2c0d74e3f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -356,6 +356,8 @@ def __ne__(self, other: Any) -> ArrayLike: """ Return for `self != other` (element-wise in-equality). """ + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented return ~(self == other) def to_numpy( @@ -459,6 +461,7 @@ def astype(self, dtype, copy=True): from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) + # FIXME: Really hard-code here? if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -924,9 +927,9 @@ def take( from the right (the default). This is similar to :func:`numpy.take`. - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + * True: ``-1`` in `indices` indicates missing values. + These values are set to `fill_value`. Any other negative + value raises a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 381968f9724b6..0e7c5a8036bcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,8 +1,10 @@ import operator -from typing import TYPE_CHECKING, Type, Union +from typing import TYPE_CHECKING, Any, Type, Union import numpy as np +from pandas._config import get_option + from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype @@ -50,17 +52,83 @@ class StringDtype(ExtensionDtype): StringDtype """ - name = "string" - #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + _metadata = ("storage",) + + def __init__(self, storage=None): + if storage is None: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + self.storage = storage + + @property + def name(self): + return f"StringDtype[{self.storage}]" @property def type(self) -> Type[str]: return str @classmethod - def construct_array_type(cls) -> Type["StringArray"]: + def construct_from_string(cls, string): + """ + Construct a StringDtype from a string. + + Parameters + ---------- + string : str + The type of the name. The storage type will be taken from `string`. + Valid options and their storage types are + + ========================== ============== + string result storage + ========================== ============== + ``'string'`` global default + ``'string[python]'`` python + ``'StringDtype[python]'`` python + ``'string[pyarrow]'`` pyarrow + ``'StringDtype[pyarrow]'`` pyarrow + ========================== ============== + + Returns + ------- + StringDtype + + Raises + ------ + TypeError + If the string is not a valid option. 
+ + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == "string": + # TODO: use global default + return cls() + elif string in {"string[python]", "StringDtype[python]"}: + return cls(storage="python") + elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}: + return cls(storage="pyarrow") + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str) and other == "string": + return True + return super().__eq__(other) + + def __hash__(self) -> int: + # custom __eq__ so have to override __hash__ + return super().__hash__() + + # XXX: this is a classmethod, but we need to know the storage type. + def construct_array_type(self) -> Type["StringArray"]: """ Return the array type associated with this dtype. @@ -68,10 +136,15 @@ def construct_array_type(cls) -> Type["StringArray"]: ------- type """ - return StringArray + from .string_arrow import ArrowStringArray + + if self.storage == "python": + return StringArray + else: + return ArrowStringArray - def __repr__(self) -> str: - return "StringDtype" + def __repr__(self): + return self.name def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] @@ -80,6 +153,7 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" import pyarrow # noqa: F811 + from .string_arrow import ArrowStringArray if isinstance(array, pyarrow.Array): chunks = [array] @@ -93,7 +167,7 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + return ArrowStringArray._concat_same_type(results) class StringArray(PandasArray): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..c0831a65b3644 --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,436 @@ +from collections.abc import Iterable +from typing import Any, Optional, Sequence, Tuple, Union + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc + +from pandas._libs import missing as libmissing +from pandas._typing import ArrayLike + +from pandas.core.dtypes.missing import isna + +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.algorithms import factorize +from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.string_ import StringDtype +from pandas.core.indexers import check_array_indexer + + +def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: + scalar = arrow_scalar.as_py() + if scalar is None: + return libmissing.NA + else: + return scalar + + +class ArrowStringArray(ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. 
+ Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. + + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + def __init__(self, values): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self.data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + self._dtype = StringDtype(storage="pyarrow") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + # TODO(ARROW-9407): Accept pd.NA in Arrow + scalars_corrected = [None if isna(x) else x for x in scalars] + return cls(pa.array(scalars_corrected, type=pa.string())) + + @property + def dtype(self) -> StringDtype: + """ + An instance of 'StringDtype'. + """ + return self._dtype + + def __array__(self, *args, **kwargs) -> "np.ndarray": + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.data.__array__(*args, **kwargs) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self.data + + @property + def size(self) -> int: + """ + Return the number of elements in this array. + + Returns + ------- + size : int + """ + return len(self.data) + + @property + def shape(self) -> Tuple[int]: + """Return the shape of the data.""" + # This may be patched by pandas to support pseudo-2D operations. + return (len(self.data),) + + @property + def ndim(self) -> int: + """Return the number of dimensions of the underlying data.""" + return 1 + + def __len__(self) -> int: + """ + Length of this array. 
+ + Returns + ------- + length : int + """ + return len(self.data) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, Iterable): + if not is_array_like(item): + item = np.array(item) + if len(item) == 0: + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item): + return self.take(item) + elif is_bool_dtype(item): + return type(self)(self.data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif is_integer(item): + if item < 0: + item += len(self) + if item >= len(self): + raise IndexError("index out of bounds") + + value = self.data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return _as_pandas_scalar(value) + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self.data.nbytes + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. 
+ """ + # TODO: Implement .to_numpy for ChunkedArray + return self.data.is_null().to_pandas().values + + def copy(self) -> ExtensionArray: + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + return type(self)(self.data) + + def __eq__(self, other: Any) -> ArrayLike: + """ + Return for `self == other` (element-wise equality). + """ + from pandas import array, Series, DataFrame, Index + + if isinstance(other, (Series, DataFrame, Index)): + return NotImplemented + if isinstance(other, ArrowStringArray): + result = pc.equal(self.data, other.data) + elif is_scalar(other): + result = pc.equal(self.data, pa.scalar(other)) + else: + raise NotImplementedError("Neither scalar nor ArrowStringArray") + + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return array(result.to_pandas().values, dtype="boolean") + + def __setitem__(self, key, value): + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self.data[0:key].chunks, + pa.array([value], type=pa.string()), + *self.data[(key + 1) :].chunks, + ] + self.data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. 
+ # TODO: Make a faster variant of this in Arrow upstream. + # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) + + if is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. 
+ ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if len(self.data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if len(indices_array) > 0 and indices_array.max() >= len(self.data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + if (indices_array < 0).any(): + if indices_array.min() < -1: + raise ValueError( + "'indicies' contains negative values other " + "-1 with 'allow_fill=True." + ) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=indices_array < 0) + result = self.data.take(indices_array) + if isna(fill_value): + return type(self)(result) + return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self.data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. 
+ if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self.data) + return type(self)(self.data.take(indices_array)) + + def value_counts(self, dropna=True): + from pandas import Series + + if dropna: + na = self.isna() + self = self[~na] + counts = self.data.value_counts() + return Series(counts.field(1), counts.field(0)) + + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + # see https://github.com/xhochy/fletcher/blob/master/fletcher/base.py + # doesn't handle dictionary types. + if self.data.num_chunks == 1: + encoded = self.data.chunk(0).dictionary_encode() + indices = encoded.indices.to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(int) + if not is_int64_dtype(indices): + indices = indices.astype(np.int64) + return indices.values, type(self)(encoded.dictionary) + else: + np_array = self.data.to_pandas().values + return factorize(np_array, na_sentinel=na_sentinel) + + @classmethod + def _concat_same_type( + cls, to_concat: Sequence["ArrowStringArray"] + ) -> "ArrowStringArray": + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea.data.iterchunks()] + ) + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0c23f1b4bcdf2..a58e6eccf7644 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -504,6 +504,19 @@ def use_inf_as_na_cb(key): ) +string_storage_doc = """ +: string + The default storage for StringDtype. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) + # Set up the io.excel specific reader configuration. 
reader_engine_doc = """ : string diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6702bf519c52e..59aa8fc5cfa0e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -901,8 +901,10 @@ def _result_dtype(arr): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. - if arr.dtype.name == "string": - return "string" + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype, StringDtype): + return arr.dtype.name else: return object @@ -2097,9 +2099,11 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): + from pandas.core.arrays.string_ import StringDtype + self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" + self._is_string = isinstance(data.dtype, StringDtype) # ._values.categories works for both Series/Index self._parent = data._values.categories if self._is_categorical else data diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000..40e3f21670ea0 --- /dev/null +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,26 @@ +import pytest + +import pandas as pd +import pandas.testing as tm + + +def test_eq_all_na(): + a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_config(): + # python by default + assert pd.StringDtype().storage == "python" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "python" + + with pd.option_context("mode.string_storage", "pyarrow"): + assert pd.StringDtype().storage == "pyarrow" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "pyarrow" + + with pytest.raises(ValueError): + pd.options.mode.string_storage = "foo" diff 
--git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index abd5c1f386dc5..f32f1e415ddc7 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -4,10 +4,9 @@ pytest.importorskip("pyarrow", minversion="0.13.0") -from .arrays import ArrowStringDtype # isort:skip - def test_constructor_from_list(): # GH 27673 - result = pd.Series(["E"], dtype=ArrowStringDtype()) - assert isinstance(result.dtype, ArrowStringDtype) + result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, pd.StringDtype) + assert result.dtype.storage == "pyarrow" diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py new file mode 100644 index 0000000000000..848e8a435b530 --- /dev/null +++ b/pandas/tests/extension/test_string_arrow.py @@ -0,0 +1,150 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return pd.StringDtype(storage="pyarrow") + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return ArrowStringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return ArrowStringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return ArrowStringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class 
TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + @pytest.mark.xfail(reason="Fails until implement, remove before merge") + def test_view(self, data): + base.BaseInterfaceTests.test_view(self, data) + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function " + "fill_null has no kernel matching input types " + "(array[string], scalar[string])" + ) + def test_take_non_na_fill_value(self, data_missing): + super().test_take_non_na_fill_value(data_missing) + + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no " + "kernel matching input types (array[string], scalar[string])" + ) + def test_reindex_non_na_fill_value(self, data_missing): + super().test_reindex_non_na_fill_value(data_missing) + + +class TestSetitem(base.BaseSetitemTests): + @pytest.mark.xfail(reason="TODO") + def test_setitem_preserves_views(self, data): + # Unclear where the issue is (pyarrow getitem, our getitem, our slice) + # and what to do here. 
+ super().test_setitem_preserves_views(data) + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestNoReduce(base.BaseNoReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + + if op_name in ["min", "max"]: + return None + + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + if op_name not in {"__eq__", "__ne__"}: + pytest.skip(f"{op_name} is not implemented.") + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") + self.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + other = pd.Series([data[0]] * len(data), dtype=data.dtype) + self._compare_other(s, data, op_name, other) + + +class TestParsing(base.BaseParsingTests): + pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +class TestGroupBy(base.BaseGroupbyTests): + pass