From 4cb60e6b586338c468e04a4274a05c06811adeb7 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jul 2020 20:19:15 +0200 Subject: [PATCH 1/2] Implement BaseDtypeTests for ArrowStringDtype --- pandas/core/arrays/base.py | 6 +- pandas/core/arrays/string_arrow.py | 484 ++++++++++++++++++++ pandas/tests/extension/test_string_arrow.py | 125 +++++ setup.py | 2 +- 4 files changed, 615 insertions(+), 2 deletions(-) create mode 100644 pandas/core/arrays/string_arrow.py create mode 100644 pandas/tests/extension/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8193d65b3b30c..736d95b4b64b6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -457,9 +457,13 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + # FIXME: Really hard-code here? 
+ if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..8248a3e91c0fe --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,484 @@ +from collections.abc import Iterable +from typing import Any, Optional, Sequence, Tuple, Type, Union + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc + +from pandas._libs import missing as libmissing +from pandas._typing import ArrayLike + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype + +import pandas as pd +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import check_array_indexer + + +def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: + scalar = arrow_scalar.as_py() + if scalar is None: + return libmissing.NA + else: + return scalar + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.ArrowStringDtype() + ArrowStringDtype + """ + + name = "arrow_string" + + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA + + @property + def type(self) -> Type[str]: + return str + + @classmethod + def construct_array_type(cls) -> Type["ArrowStringArray"]: + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ + return ArrowStringArray + + def __hash__(self) -> int: + return hash("ArrowStringDtype") + + def __repr__(self) -> str: + return "ArrowStringDtype" + + def __from_arrow__( + self, array: Union["pa.Array", "pa.ChunkedArray"] + ) -> "ArrowStringArray": + """ + Construct StringArray from pyarrow Array/ChunkedArray. + """ + return ArrowStringArray(array) + + def __eq__(self, other) -> bool: + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, ArrowStringDtype): + return True + elif isinstance(other, str) and other == "arrow_string": + return True + else: + return False + + +class ArrowStringArray(ExtensionArray): + """ + Extension array for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.1.0 + + .. warning:: + + ArrowStringArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + The array of data. + + Attributes + ---------- + None + + Methods + ------- + None + + See Also + -------- + array + The recommended function for creating a ArrowStringArray. + Series.str + The string methods are available on Series backed by + a ArrowStringArray. + + Notes + ----- + ArrowStringArray returns a BooleanArray for comparison methods. 
+ + Examples + -------- + >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") + + ['This is', 'some text', , 'data.'] + Length: 4, dtype: arrow_string + """ + + def __init__(self, values): + if isinstance(values, pa.Array): + self.data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self.data = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + # TODO(ARROW-9407): Accept pd.NA in Arrow + scalars_corrected = [None if pd.isna(x) else x for x in scalars] + return cls(pa.array(scalars_corrected, type=pa.string())) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return ArrowStringDtype() + + def __array__(self, *args, **kwargs) -> "np.ndarray": + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.data.__array__(*args, **kwargs) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self.data + + @property + def size(self) -> int: + """ + Return the number of elements in this array. + + Returns + ------- + size : int + """ + return len(self.data) + + @property + def shape(self) -> Tuple[int]: + """Return the shape of the data.""" + # This may be patched by pandas to support pseudo-2D operations. + return (len(self.data),) + + @property + def ndim(self) -> int: + """Return the number of dimensions of the underlying data.""" + return 1 + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self.data) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. 
+ + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, Iterable): + if not is_array_like(item): + item = np.array(item) + if len(item) == 0: + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item): + return self.take(item) + elif is_bool_dtype(item): + return type(self)(self.data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif is_integer(item): + if item < 0: + item += len(self) + if item >= len(self): + raise IndexError("index out of bounds") + + value = self.data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return _as_pandas_scalar(value) + + def fillna(self, value=None, method=None, limit=None): + raise NotImplementedError("fillna") + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self.data.nbytes + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. 
+ """ + # TODO: Implement .to_numpy for ChunkedArray + return self.data.is_null().to_pandas().values + + def copy(self) -> ExtensionArray: + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + return type(self)(self.data) + + def __eq__(self, other: Any) -> ArrayLike: + """ + Return for `self == other` (element-wise equality). + """ + if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + return NotImplemented + if isinstance(other, ArrowStringArray): + result = pc.equal(self.data, other.data) + elif is_scalar(other): + result = pc.equal(self.data, pa.scalar(other)) + else: + raise NotImplementedError("Neither scalar nor ArrowStringArray") + + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return pd.array(result.to_pandas().values) + + def __setitem__(self, key, value): + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not pd.api.types.is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif pd.isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self.data[0:key].chunks, + pa.array([value], type=pa.string()), + *self.data[(key + 1) :].chunks, + ] + self.data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. 
+ # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) + + if pd.api.types.is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. 
+ ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if len(self.data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.max() >= len(self.data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + if (indices_array < 0).any(): + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=indices_array < 0) + result = self.data.take(indices_array) + if pd.isna(fill_value): + return type(self)(result) + return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self.data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. 
+ if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self.data) + return type(self)(self.data.take(indices_array)) diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py new file mode 100644 index 0000000000000..437d51060fb7f --- /dev/null +++ b/pandas/tests/extension/test_string_arrow.py @@ -0,0 +1,125 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.tests.extension import base + + +@pytest.fixture +def dtype(): + return ArrowStringDtype() + + +@pytest.fixture +def data(): + strings = np.random.choice(list(string.ascii_letters), size=100) + while strings[0] == strings[1]: + strings = np.random.choice(list(string.ascii_letters), size=100) + + return ArrowStringArray._from_sequence(strings) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return ArrowStringArray._from_sequence([pd.NA, "A"]) + + +@pytest.fixture +def data_for_sorting(): + return ArrowStringArray._from_sequence(["B", "C", "A"]) + + +@pytest.fixture +def data_missing_for_sorting(): + return ArrowStringArray._from_sequence(["B", pd.NA, "A"]) + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(): + return ArrowStringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + @pytest.mark.xfail(reason="Fails until implement, remove before merge") + def test_view(self, data): + base.BaseInterfaceTests.test_view(self, data) + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +# class TestReshaping(base.BaseReshapingTests): +# pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +# class 
TestMissing(base.BaseMissingTests): +# pass + + +# class TestNoReduce(base.BaseNoReduceTests): +# @pytest.mark.parametrize("skipna", [True, False]) +# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): +# op_name = all_numeric_reductions +# +# if op_name in ["min", "max"]: +# return None +# +# s = pd.Series(data) +# with pytest.raises(TypeError): +# getattr(s, op_name)(skipna=skipna) + + +# class TestMethods(base.BaseMethodsTests): +# @pytest.mark.skip(reason="returns nullable") +# def test_value_counts(self, all_data, dropna): +# return super().test_value_counts(all_data, dropna) + + +# class TestCasting(base.BaseCastingTests): +# pass + + +# class TestComparisonOps(base.BaseComparisonOpsTests): +# def _compare_other(self, s, data, op_name, other): +# result = getattr(s, op_name)(other) +# expected = getattr(s.astype(object), op_name)(other).astype("boolean") +# self.assert_series_equal(result, expected) + +# def test_compare_scalar(self, data, all_compare_operators): +# op_name = all_compare_operators +# s = pd.Series(data) +# self._compare_other(s, data, op_name, "abc") + + +# class TestParsing(base.BaseParsingTests): +# pass + + +# class TestPrinting(base.BasePrintingTests): +# pass + + +# class TestGroupBy(base.BaseGroupbyTests): +# pass diff --git a/setup.py b/setup.py index f6f0cd9aabc0e..4033ea2935de5 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = ["-Werror"] + extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") From d242f2d0bc2d0eae9481ce2fa09969d9eb20113c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 3 Sep 2020 15:32:45 -0500 Subject: [PATCH 2/2] Refactor to use parametrized StringDtype --- pandas/core/arrays/base.py | 13 +- pandas/core/arrays/string_.py | 90 +++++++++- pandas/core/arrays/string_arrow.py | 166 +++++++----------- 
pandas/core/config_init.py | 13 ++ pandas/core/strings.py | 10 +- .../tests/arrays/string_/test_string_arrow.py | 26 +++ pandas/tests/extension/arrow/test_string.py | 7 +- pandas/tests/extension/test_string_arrow.py | 103 +++++++---- setup.py | 2 +- 9 files changed, 261 insertions(+), 169 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_string_arrow.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 736d95b4b64b6..9b1b2c0d74e3f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -356,6 +356,8 @@ def __ne__(self, other: Any) -> ArrayLike: """ Return for `self != other` (element-wise in-equality). """ + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented return ~(self == other) def to_numpy( @@ -457,13 +459,10 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) # FIXME: Really hard-code here? - if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -928,9 +927,9 @@ def take( from the right (the default). This is similar to :func:`numpy.take`. - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + * True: ``-1`` in `indices` indicate missing values. + These values are set to `fill_value`. Any other other negative + value raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 381968f9724b6..0e7c5a8036bcf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,8 +1,10 @@ import operator -from typing import TYPE_CHECKING, Type, Union +from typing import TYPE_CHECKING, Any, Type, Union import numpy as np +from pandas._config import get_option + from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype @@ -50,17 +52,83 @@ class StringDtype(ExtensionDtype): StringDtype """ - name = "string" - #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + _metadata = ("storage",) + + def __init__(self, storage=None): + if storage is None: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow"}: + raise ValueError( + f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + ) + self.storage = storage + + @property + def name(self): + return f"StringDtype[{self.storage}]" @property def type(self) -> Type[str]: return str @classmethod - def construct_array_type(cls) -> Type["StringArray"]: + def construct_from_string(cls, string): + """ + Construct a StringDtype from a string. + + Parameters + ---------- + string : str + The type of the name. The storage type will be taking from `string`. + Valid options and their storage types are + + ========================== ============== + string result storage + ========================== ============== + ``'string'`` global default + ``'string[python]'`` python + ``'StringDtype[python]'`` python + ``'string[pyarrow]'`` pyarrow + ``'StringDtype[pyarrow]'`` pyarrow + ========================== ============= + + Returns + ------- + StringDtype + + Raise + ----- + TypeError + If the string is not a valid option. 
+ + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == "string": + # TODO: use global default + return cls() + elif string in {"string[python]", "StringDtype[python]"}: + return cls(storage="python") + elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}: + return cls(storage="pyarrow") + else: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + + def __eq__(self, other: Any) -> bool: + if isinstance(other, str) and other == "string": + return True + return super().__eq__(other) + + def __hash__(self) -> int: + # custom __eq__ so have to override __hash__ + return super().__hash__() + + # XXX: this is a classmethod, but we need to know the storage type. + def construct_array_type(self) -> Type["StringArray"]: """ Return the array type associated with this dtype. @@ -68,10 +136,15 @@ def construct_array_type(cls) -> Type["StringArray"]: ------- type """ - return StringArray + from .string_arrow import ArrowStringArray + + if self.storage == "python": + return StringArray + else: + return ArrowStringArray - def __repr__(self) -> str: - return "StringDtype" + def __repr__(self): + return self.name def __from_arrow__( self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] @@ -80,6 +153,7 @@ def __from_arrow__( Construct StringArray from pyarrow Array/ChunkedArray. 
""" import pyarrow # noqa: F811 + from .string_arrow import ArrowStringArray if isinstance(array, pyarrow.Array): chunks = [array] @@ -93,7 +167,7 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + return ArrowStringArray._concat_same_type(results) class StringArray(PandasArray): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8248a3e91c0fe..c0831a65b3644 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,5 @@ from collections.abc import Iterable -from typing import Any, Optional, Sequence, Tuple, Type, Union +from typing import Any, Optional, Sequence, Tuple, Union import numpy as np import pyarrow as pa @@ -8,18 +8,19 @@ from pandas._libs import missing as libmissing from pandas._typing import ArrayLike -from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna -import pandas as pd from pandas.api.types import ( is_array_like, is_bool_dtype, + is_int64_dtype, is_integer, is_integer_dtype, is_scalar, ) +from pandas.core.algorithms import factorize from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import check_array_indexer @@ -31,89 +32,6 @@ def _as_pandas_scalar(arrow_scalar: pa.Scalar) -> Optional[str]: return scalar -@register_extension_dtype -class ArrowStringDtype(ExtensionDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.1.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. 
- - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> pd.ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - @property - def type(self) -> Type[str]: - return str - - @classmethod - def construct_array_type(cls) -> Type["ArrowStringArray"]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( - self, array: Union["pa.Array", "pa.ChunkedArray"] - ) -> "ArrowStringArray": - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - class ArrowStringArray(ExtensionArray): """ Extension array for string data in a ``pyarrow.ChunkedArray``. @@ -165,19 +83,20 @@ def __init__(self, values): self.data = values else: raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") + self._dtype = StringDtype(storage="pyarrow") @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO(ARROW-9407): Accept pd.NA in Arrow - scalars_corrected = [None if pd.isna(x) else x for x in scalars] + scalars_corrected = [None if isna(x) else x for x in scalars] return cls(pa.array(scalars_corrected, type=pa.string())) @property - def dtype(self) -> ArrowStringDtype: + def dtype(self) -> StringDtype: """ - An instance of 'ArrowStringDtype'. 
+ An instance of 'StringDtype'. """ - return ArrowStringDtype() + return self._dtype def __array__(self, *args, **kwargs) -> "np.ndarray": """Correctly construct numpy arrays when passed to `np.asarray()`.""" @@ -276,15 +195,6 @@ def __getitem__(self, item): else: return _as_pandas_scalar(value) - def fillna(self, value=None, method=None, limit=None): - raise NotImplementedError("fillna") - - def _reduce(self, name, skipna=True, **kwargs): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna) - - raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - @property def nbytes(self) -> int: """ @@ -320,7 +230,9 @@ def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). """ - if isinstance(other, (pd.Series, pd.DataFrame, pd.Index)): + from pandas import array, Series, DataFrame, Index + + if isinstance(other, (Series, DataFrame, Index)): return NotImplemented if isinstance(other, ArrowStringArray): result = pc.equal(self.data, other.data) @@ -330,7 +242,7 @@ def __eq__(self, other: Any) -> ArrayLike: raise NotImplementedError("Neither scalar nor ArrowStringArray") # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return pd.array(result.to_pandas().values) + return array(result.to_pandas().values, dtype="boolean") def __setitem__(self, key, value): # type: (Union[int, np.ndarray], Any) -> None @@ -357,9 +269,9 @@ def __setitem__(self, key, value): key = check_array_indexer(self, key) if is_integer(key): - if not pd.api.types.is_scalar(value): + if not is_scalar(value): raise ValueError("Must pass scalars with scalar indexer") - elif pd.isna(value): + elif isna(value): value = None elif not isinstance(value, str): raise ValueError("Scalar must be NA or str") @@ -386,7 +298,7 @@ def __setitem__(self, key, value): # TODO(ARROW-9431): Directly support setitem(integers) key_array = np.asanyarray(key) - if pd.api.types.is_scalar(value): + if is_scalar(value): value = np.broadcast_to(value, 
len(key_array)) else: value = np.asarray(value) @@ -461,15 +373,20 @@ def take( if len(self.data) == 0 and (indices_array >= 0).any(): raise IndexError("cannot do a non-empty take") - if indices_array.max() >= len(self.data): + if len(indices_array) > 0 and indices_array.max() >= len(self.data): raise IndexError("out of bounds value in 'indices'.") if allow_fill: if (indices_array < 0).any(): + if indices_array.min() < -1: + raise ValueError( + "'indicies' contains negative values other " + "-1 with 'allow_fill=True." + ) # TODO(ARROW-9433): Treat negative indices as NULL indices_array = pa.array(indices_array, mask=indices_array < 0) result = self.data.take(indices_array) - if pd.isna(fill_value): + if isna(fill_value): return type(self)(result) return type(self)(pc.fill_null(result, pa.scalar(fill_value))) else: @@ -482,3 +399,38 @@ def take( indices_array = np.copy(indices_array) indices_array[indices_array < 0] += len(self.data) return type(self)(self.data.take(indices_array)) + + def value_counts(self, dropna=True): + from pandas import Series + + if dropna: + na = self.isna() + self = self[~na] + counts = self.data.value_counts() + return Series(counts.field(1), counts.field(0)) + + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + # see https://github.com/xhochy/fletcher/blob/master/fletcher/base.py + # doesn't handle dictionary types. 
+ if self.data.num_chunks == 1: + encoded = self.data.chunk(0).dictionary_encode() + indices = encoded.indices.to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(int) + if not is_int64_dtype(indices): + indices = indices.astype(np.int64) + return indices.values, type(self)(encoded.dictionary) + else: + np_array = self.data.to_pandas().values + return factorize(np_array, na_sentinel=na_sentinel) + + @classmethod + def _concat_same_type( + cls, to_concat: Sequence["ArrowStringArray"] + ) -> "ArrowStringArray": + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea.data.iterchunks()] + ) + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0c23f1b4bcdf2..a58e6eccf7644 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -504,6 +504,19 @@ def use_inf_as_na_cb(key): ) +string_storage_doc = """ +: string + The default storage for StringDtype. +""" + +with cf.config_prefix("mode"): + cf.register_option( + "string_storage", + "python", + string_storage_doc, + validator=is_one_of_factory(["python", "pyarrow"]), + ) + # Set up the io.excel specific reader configuration. reader_engine_doc = """ : string diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6702bf519c52e..59aa8fc5cfa0e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -901,8 +901,10 @@ def _result_dtype(arr): # workaround #27953 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. 
- if arr.dtype.name == "string": - return "string" + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype.name, StringDtype): + return arr.dtype.name else: return object @@ -2097,9 +2099,11 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): + from pandas.core.arrays.string_ import StringDtype + self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" + self._is_string = isinstance(data.dtype, StringDtype) # ._values.categories works for both Series/Index self._parent = data._values.categories if self._is_categorical else data diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py new file mode 100644 index 0000000000000..40e3f21670ea0 --- /dev/null +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -0,0 +1,26 @@ +import pytest + +import pandas as pd +import pandas.testing as tm + + +def test_eq_all_na(): + a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_config(): + # python by default + assert pd.StringDtype().storage == "python" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "python" + + with pd.option_context("mode.string_storage", "pyarrow"): + assert pd.StringDtype().storage == "pyarrow" + arr = pd.array(["a", "b"]) + assert arr.dtype.storage == "pyarrow" + + with pytest.raises(ValueError): + pd.options.mode.string_storage = "foo" diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index abd5c1f386dc5..f32f1e415ddc7 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -4,10 +4,9 @@ pytest.importorskip("pyarrow", minversion="0.13.0") -from .arrays import ArrowStringDtype # isort:skip - def 
test_constructor_from_list(): # GH 27673 - result = pd.Series(["E"], dtype=ArrowStringDtype()) - assert isinstance(result.dtype, ArrowStringDtype) + result = pd.Series(["E"], dtype=pd.StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, pd.StringDtype) + assert result.dtype.storage == "pyarrow" diff --git a/pandas/tests/extension/test_string_arrow.py b/pandas/tests/extension/test_string_arrow.py index 437d51060fb7f..848e8a435b530 100644 --- a/pandas/tests/extension/test_string_arrow.py +++ b/pandas/tests/extension/test_string_arrow.py @@ -4,13 +4,13 @@ import pytest import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype +from pandas.core.arrays.string_arrow import ArrowStringArray from pandas.tests.extension import base @pytest.fixture def dtype(): - return ArrowStringDtype() + return pd.StringDtype(storage="pyarrow") @pytest.fixture @@ -62,64 +62,89 @@ class TestConstructors(base.BaseConstructorsTests): pass -# class TestReshaping(base.BaseReshapingTests): -# pass +class TestReshaping(base.BaseReshapingTests): + pass class TestGetitem(base.BaseGetitemTests): - pass + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function " + "fill_null has no kernel matching input types " + "(array[string], scalar[string])" + ) + def test_take_non_na_fill_value(self, data_missing): + super().test_take_non_na_fill_value(data_missing) + + @pytest.mark.xfail( + reason="pyarrow.lib.ArrowNotImplementedError: Function fill_null has no " + "kernel matching input types (array[string], scalar[string])" + ) + def test_reindex_non_na_fill_value(self, data_missing): + super().test_reindex_non_na_fill_value(data_missing) class TestSetitem(base.BaseSetitemTests): + @pytest.mark.xfail(reason="TODO") + def test_setitem_preserves_views(self, data): + # Unclear where the issue is (pyarrow getitem, our getitem, our slice) + # and what to do here. 
+ super().test_setitem_preserves_views(data) + + +class TestMissing(base.BaseMissingTests): pass -# class TestMissing(base.BaseMissingTests): -# pass +class TestNoReduce(base.BaseNoReduceTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + if op_name in ["min", "max"]: + return None -# class TestNoReduce(base.BaseNoReduceTests): -# @pytest.mark.parametrize("skipna", [True, False]) -# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): -# op_name = all_numeric_reductions -# -# if op_name in ["min", "max"]: -# return None -# -# s = pd.Series(data) -# with pytest.raises(TypeError): -# getattr(s, op_name)(skipna=skipna) + s = pd.Series(data) + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) -# class TestMethods(base.BaseMethodsTests): -# @pytest.mark.skip(reason="returns nullable") -# def test_value_counts(self, all_data, dropna): -# return super().test_value_counts(all_data, dropna) +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) -# class TestCasting(base.BaseCastingTests): -# pass +class TestCasting(base.BaseCastingTests): + pass -# class TestComparisonOps(base.BaseComparisonOpsTests): -# def _compare_other(self, s, data, op_name, other): -# result = getattr(s, op_name)(other) -# expected = getattr(s.astype(object), op_name)(other).astype("boolean") -# self.assert_series_equal(result, expected) +class TestComparisonOps(base.BaseComparisonOpsTests): + def _compare_other(self, s, data, op_name, other): + if op_name not in {"__eq__", "__ne__"}: + pytest.skip(f"{op_name} is not implemented.") + result = getattr(s, op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") + self.assert_series_equal(result, expected) -# def 
test_compare_scalar(self, data, all_compare_operators): -# op_name = all_compare_operators -# s = pd.Series(data) -# self._compare_other(s, data, op_name, "abc") + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + self._compare_other(s, data, op_name, "abc") + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + s = pd.Series(data) + other = pd.Series([data[0]] * len(data), dtype=data.dtype) + self._compare_other(s, data, op_name, other) -# class TestParsing(base.BaseParsingTests): -# pass + +class TestParsing(base.BaseParsingTests): + pass -# class TestPrinting(base.BasePrintingTests): -# pass +class TestPrinting(base.BasePrintingTests): + pass -# class TestGroupBy(base.BaseGroupbyTests): -# pass +class TestGroupBy(base.BaseGroupbyTests): + pass diff --git a/setup.py b/setup.py index 4033ea2935de5..f6f0cd9aabc0e 100755 --- a/setup.py +++ b/setup.py @@ -432,7 +432,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g")