Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arrow string array dtype #36142

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ def __ne__(self, other: Any) -> ArrayLike:
"""
Return for `self != other` (element-wise inequality).
"""
if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
return NotImplemented
return ~(self == other)

def to_numpy(
Expand Down Expand Up @@ -459,6 +461,7 @@ def astype(self, dtype, copy=True):
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
# FIXME: Really hard-code here?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs the general 'astype' machinery refactor to fix this, I think

if isinstance(dtype, StringDtype): # allow conversion to StringArrays
return dtype.construct_array_type()._from_sequence(self, copy=False)

Expand Down Expand Up @@ -924,9 +927,9 @@ def take(
from the right (the default). This is similar to
:func:`numpy.take`.

* True: negative values in `indices` indicate
missing values. These values are set to `fill_value`. Any other
negative values raise a ``ValueError``.
* True: ``-1`` in `indices` indicates missing values.
These values are set to `fill_value`. Any other negative
value raises a ``ValueError``.

fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
Expand Down
90 changes: 82 additions & 8 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import operator
from typing import TYPE_CHECKING, Type, Union
from typing import TYPE_CHECKING, Any, Type, Union

import numpy as np

from pandas._config import get_option

from pandas._libs import lib, missing as libmissing

from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
Expand Down Expand Up @@ -50,28 +52,99 @@ class StringDtype(ExtensionDtype):
StringDtype
"""

name = "string"

#: StringDtype.na_value uses pandas.NA
na_value = libmissing.NA
_metadata = ("storage",)

def __init__(self, storage=None):
# Record which storage backend this dtype instance uses.
# `storage` may be "python", "pyarrow", or None. Any other value is
# rejected with a ValueError below.
if storage is None:
# No explicit choice: use the current value of the
# "mode.string_storage" option.
storage = get_option("mode.string_storage")
if storage not in {"python", "pyarrow"}:
raise ValueError(
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
)
self.storage = storage

@property
def name(self):
return f"StringDtype[{self.storage}]"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return f"StringDtype[{self.storage}]"
return f"string[{self.storage}]"

?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, this seems better.


@property
def type(self) -> Type[str]:
# Always the built-in `str` class, independent of `self.storage`.
return str

@classmethod
def construct_array_type(cls) -> Type["StringArray"]:
def construct_from_string(cls, string):
"""
Construct a StringDtype from a string.

Parameters
----------
string : str
The type of the name. The storage type will be taken from `string`.
Valid options and their storage types are

========================== ==============
string result storage
========================== ==============
``'string'`` global default
``'string[python]'`` python
``'StringDtype[python]'`` python
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why allow the full class name "StringDtype" as string as well? (I don't think we do that for other dtypes?)

EDIT: ah, I see you used that as name above. But so then: why not "string" as name?

``'string[pyarrow]'`` pyarrow
``'StringDtype[pyarrow]'`` pyarrow
========================== ==============

Returns
-------
StringDtype

Raises
------
TypeError
If the string is not a valid option.

"""
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
if string == "string":
# TODO: use global default
return cls()
elif string in {"string[python]", "StringDtype[python]"}:
return cls(storage="python")
elif string in {"string[pyarrow]", "StringDtype[pyarrow]"}:
return cls(storage="pyarrow")
else:
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

def __eq__(self, other: Any) -> bool:
# The bare string "string" compares equal to this dtype whatever its
# storage is; every other comparison is delegated to the parent class.
if isinstance(other, str) and other == "string":
return True
return super().__eq__(other)

def __hash__(self) -> int:
# custom __eq__ so have to override __hash__
# (Python sets __hash__ to None on a class that defines __eq__ without
# also defining __hash__, so the parent's implementation is re-exposed
# here explicitly.)
return super().__hash__()

# XXX: this is a classmethod, but we need to know the storage type.
def construct_array_type(self) -> Type["StringArray"]:
"""
Return the array type associated with this dtype.

Returns
-------
type
"""
return StringArray
from .string_arrow import ArrowStringArray

if self.storage == "python":
return StringArray
else:
return ArrowStringArray

def __repr__(self) -> str:
return "StringDtype"
def __repr__(self):
return self.name

def __from_arrow__(
self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
Expand All @@ -80,6 +153,7 @@ def __from_arrow__(
Construct StringArray from pyarrow Array/ChunkedArray.
"""
import pyarrow # noqa: F811
from .string_arrow import ArrowStringArray

if isinstance(array, pyarrow.Array):
chunks = [array]
Expand All @@ -93,7 +167,7 @@ def __from_arrow__(
str_arr = StringArray._from_sequence(np.array(arr))
results.append(str_arr)

return StringArray._concat_same_type(results)
return ArrowStringArray._concat_same_type(results)


class StringArray(PandasArray):
Expand Down
Loading