From f5ec4b23940195812136274e8d31b11951c5b294 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Wed, 16 Feb 2022 22:05:30 -0800 Subject: [PATCH] Implement DataFrame pct_change (#9805) Fixes: https://github.com/rapidsai/cudf/issues/9603 Next steps: - [ ] Fix https://github.com/rapidsai/cudf/issues/10314 - [ ] Move `diff` and `pct_change` to `indexed_frame` so that both are accessible to `Series` as well Authors: - Sheilah Kirui (https://github.com/skirui-source) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/9805 --- python/cudf/cudf/core/column/column.py | 8 ++++ python/cudf/cudf/core/dataframe.py | 40 +++++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 49 +++++++++++++++++++----- 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 393afe4a5b9..1c1c2ef2bf6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -335,6 +335,14 @@ def _fill( return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: + # libcudf currently doesn't handle case when offset > len(df) + # ticket to fix the bug in link below: + # https://github.com/rapidsai/cudf/issues/10314 + if abs(offset) > len(self): + if fill_value is None: + return column_empty_like(self, masked=True) + else: + return full(len(self), fill_value, dtype=self.dtype) return libcudf.copying.shift(self, offset, fill_value) @property diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index efff67b686a..a4de6db9bda 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6196,6 +6196,46 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + def pct_change( + self, periods=1, fill_method="ffill", 
limit=None, freq=None + ): + """ + Calculates the percent change between sequential elements + in the DataFrame. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'ffill' + How to handle NAs before computing percent changes. + limit : int, optional + The number of consecutive NAs to fill before stopping. + Not yet implemented. + freq : str, optional + Increment to use from time series API. + Not yet implemented. + + Returns + ------- + DataFrame + """ + if limit is not None: + raise NotImplementedError("limit parameter not supported yet.") + if freq is not None: + raise NotImplementedError("freq parameter not supported yet.") + elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + raise ValueError( + "fill_method must be one of 'ffill', 'pad', " + "'bfill', or 'backfill'." + ) + + data = self.fillna(method=fill_method, limit=limit) + + return data.diff(periods=periods) / data.shift( + periods=periods, freq=freq + ) + def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 729d1a5ca37..f765c614907 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3442,29 +3442,37 @@ def test_get_numeric_data(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) @pytest.mark.parametrize("data_empty", [False, True]) def test_shift(dtype, period, data_empty): - + # TODO : this function currently tests for series.shift() + # but should instead test for dataframe.shift() if data_empty: data = None else: if dtype == np.int8: # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) + data = gen_rand(dtype, 10, low=-2, high=2) else: - data = gen_rand(dtype, 100000) + data = 
gen_rand(dtype, 10) - gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) + gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) + ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) - shifted_outcome = gdf.a.shift(period).fillna(0) - expected_outcome = pdf.a.shift(period).fillna(0).astype(dtype) + shifted_outcome = gs.a.shift(period) + expected_outcome = ps.a.shift(period) + # pandas uses NaNs to signal missing value and force converts the + # results columns to float types if data_empty: - assert_eq(shifted_outcome, expected_outcome, check_index_type=False) + assert_eq( + shifted_outcome, + expected_outcome, + check_index_type=False, + check_dtype=False, + ) else: - assert_eq(shifted_outcome, expected_outcome) + assert_eq(shifted_outcome, expected_outcome, check_dtype=False) @pytest.mark.parametrize("dtype", NUMERIC_TYPES) @@ -9295,3 +9303,24 @@ def test_dataframe_rename_duplicate_column(): ValueError, match="Duplicate column names are not allowed" ): gdf.rename(columns={"a": "b"}, inplace=True) + + +@pytest.mark.parametrize( + "data", + [ + np.random.RandomState(seed=10).randint(-50, 50, (10, 10)), + np.random.RandomState(seed=10).random_sample((4, 4)), + np.array([1.123, 2.343, 5.890, 0.0]), + {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, + ], +) +@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +def test_dataframe_pct_change(data, periods, fill_method): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.pct_change(periods=periods, fill_method=fill_method) + expected = pdf.pct_change(periods=periods, fill_method=fill_method) + + assert_eq(expected, actual)