From 884b606dc9c4a8b2534aa44cab69e8d5ff709265 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 21 Apr 2023 21:26:57 -0400 Subject: [PATCH 1/3] arrow duration overflowing on arithmetic ops --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/arrays/arrow/array.py | 8 +++++++- pandas/tests/extension/test_arrow.py | 11 +++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 5c9276d0fbf8a..24872bd8fce45 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -41,6 +41,7 @@ Bug fixes - Bug in logical and comparison operations between :class:`ArrowDtype` and numpy masked types (e.g. ``"boolean"``) (:issue:`52625`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) +- Bug in :class:`~arrays.ArrowExtensionArray` with duration dtype overflowing when constructed from data containing numpy ``NaT`` (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d83bf9d340993..3373e282cb595 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -258,7 +258,13 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal scalars = pa.array(scalars, from_pandas=True) if pa_dtype and scalars.type != pa_dtype: scalars = scalars.cast(pa_dtype) - return cls(scalars) + arr = cls(scalars) + if pa.types.is_duration(scalars.type) and scalars.null_count > 0: + # GH#####: upstream bug for duration types when originally + # constructed with data containing numpy NaT. + # https://github.com/apache/arrow/issues/35088 + arr = arr.fillna(arr.dtype.na_value) + return arr @classmethod def _from_sequence_of_strings( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9191560bd9a68..3fbf8c618ed53 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2830,3 +2830,14 @@ def test_date32_repr(): arrow_dt = pa.array([date.fromisoformat("2020-01-01")], type=pa.date32()) ser = pd.Series(arrow_dt, dtype=ArrowDtype(arrow_dt.type)) assert repr(ser) == "0 2020-01-01\ndtype: date32[day][pyarrow]" + + +def test_duration_overflow_from_ndarray_containing_nat(): + # GH#### + data_ts = pd.to_datetime([1, None]) + data_td = pd.to_timedelta([1, None]) + ser_ts = pd.Series(data_ts, dtype=ArrowDtype(pa.timestamp("ns"))) + ser_td = pd.Series(data_td, dtype=ArrowDtype(pa.duration("ns"))) + result = ser_ts + ser_td + expected = pd.Series([2, None], dtype=ArrowDtype(pa.timestamp("ns"))) + tm.assert_series_equal(result, expected) From 4fe8b3c77dac27f128620791c282b3569668328e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 21 Apr 2023 21:38:58 -0400 Subject: [PATCH 2/3] gh refs --- doc/source/whatsnew/v2.0.1.rst | 2 +- pandas/core/arrays/arrow/array.py | 2 +- pandas/tests/extension/test_arrow.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 24872bd8fce45..223721c3178f7 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -27,6 +27,7 @@ Bug fixes ~~~~~~~~~ - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) +- Bug in :class:`~arrays.ArrowExtensionArray` with duration dtype overflowing when constructed from data containing numpy ``NaT`` (:issue:`52843`) - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`) @@ -41,7 +42,6 @@ Bug fixes - Bug in logical and comparison operations between :class:`ArrowDtype` and numpy masked types (e.g. ``"boolean"``) (:issue:`52625`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) -- Bug in :class:`~arrays.ArrowExtensionArray` with duration dtype overflowing when constructed from data containing numpy ``NaT`` (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3373e282cb595..51d6fa74ea94e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -260,7 +260,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal scalars = scalars.cast(pa_dtype) arr = cls(scalars) if pa.types.is_duration(scalars.type) and scalars.null_count > 0: - # GH#####: upstream bug for duration types when originally + # GH52843: upstream bug for duration types when originally # constructed with data containing numpy NaT. # https://github.com/apache/arrow/issues/35088 arr = arr.fillna(arr.dtype.na_value) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3fbf8c618ed53..5a7c92ff5e768 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2833,7 +2833,7 @@ def test_date32_repr(): def test_duration_overflow_from_ndarray_containing_nat(): - # GH#### + # GH52843 data_ts = pd.to_datetime([1, None]) data_td = pd.to_timedelta([1, None]) ser_ts = pd.Series(data_ts, dtype=ArrowDtype(pa.timestamp("ns"))) From 094fd4f0c0147a7db1bd1eb324e856042d9bcc95 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 22 Apr 2023 06:38:52 -0400 Subject: [PATCH 3/3] min versions --- pandas/tests/extension/test_arrow.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5a7c92ff5e768..0300b271acc3f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2832,6 +2832,11 @@ def test_date32_repr(): assert repr(ser) == "0 2020-01-01\ndtype: date32[day][pyarrow]" +@pytest.mark.xfail( + pa_version_under8p0, + reason="Function 'add_checked' has no kernel matching input types", + raises=pa.ArrowNotImplementedError, +) def test_duration_overflow_from_ndarray_containing_nat(): # GH52843 data_ts = pd.to_datetime([1, None])