diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5adc8540e6864..7ec74b7045437 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -748,7 +748,7 @@ Strings ^^^^^^^ - Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`) -- +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` ignoring replacements with ``regex=True`` for ``StringDtype`` data (:issue:`41333`, :issue:`35977`) Interval ^^^^^^^^ diff --git a/pandas/conftest.py b/pandas/conftest.py index 7b29c41ef70f5..f948dc11bc014 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1153,6 +1153,27 @@ def object_dtype(request): return request.param +@pytest.fixture( + params=[ + "object", + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def any_string_dtype(request): + """ + Parametrized fixture for string dtypes. + * 'object' + * 'string' + * 'arrow_string' + """ + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + return request.param + + @pytest.fixture(params=tm.DATETIME64_DTYPES) def datetime64_dtype(request): """ diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 201b9fdcc51cc..2d3a168a31e1e 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -149,7 +149,7 @@ def re_replacer(s): else: return s - f = np.vectorize(re_replacer, otypes=[values.dtype]) + f = np.vectorize(re_replacer, otypes=[np.object_]) if mask is None: values[:] = f(values) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 92f9d803d1ebe..bd4dfdb4ebad0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -49,6 +49,7 @@ is_extension_array_dtype, is_list_like, is_sparse, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -788,7 
+789,7 @@ def _replace_list( src_len = len(pairs) - 1 - if values.dtype == _dtype_obj: + if is_string_dtype(values): # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations mask = ~isna(values) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index e6ed60dc2bb08..3ffaf67c656d9 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -563,10 +563,11 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character(self): + def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): # GH 25259 - df = DataFrame({"first": ["abc", "bca", "cab"]}) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) + dtype = any_string_dtype + df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) result = df.replace({"a": "."}, regex=True) tm.assert_frame_equal(result, expected) @@ -685,6 +686,24 @@ def test_replace_regex_metachar(self, metachar): expected = DataFrame({"a": ["paren", "else"]}) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "data,to_replace,expected", + [ + (["xax", "xbx"], {"a": "c", "b": "d"}, ["xcx", "xdx"]), + (["d", "", ""], {r"^\s*$": pd.NA}, ["d", pd.NA, pd.NA]), + ], + ) + def test_regex_replace_string_types( + self, data, to_replace, expected, frame_or_series, any_string_dtype + ): + # GH-41333, GH-35977 + dtype = any_string_dtype + obj = frame_or_series(data, dtype=dtype) + result = obj.replace(to_replace, regex=True) + expected = frame_or_series(expected, dtype=dtype) + + tm.assert_equal(result, expected) + def test_replace(self, datetime_frame): datetime_frame["A"][:5] = np.nan datetime_frame["A"][-5:] = np.nan diff --git a/pandas/tests/strings/conftest.py 
b/pandas/tests/strings/conftest.py index 17703d970e29e..4fedbee91f649 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import Series from pandas.core import strings as strings @@ -175,24 +173,3 @@ def any_allowed_skipna_inferred_dtype(request): # correctness of inference tested in tests/dtypes/test_inference.py return inferred_dtype, values - - -@pytest.fixture( - params=[ - "object", - "string", - pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ] -) -def any_string_dtype(request): - """ - Parametrized fixture for string dtypes. - * 'object' - * 'string' - * 'arrow_string' - """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - return request.param