Skip to content

Commit

Permalink
Implement DataFrame pct_change (#9805)
Browse files Browse the repository at this point in the history
Fixes: #9603

Next steps:

- [ ] Fix #10314 
- [ ] Move `diff` and `pct_change` to `indexed_frame` so that both are accessible to `Series` as well

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Michael Wang (https://github.com/isVoid)

URL: #9805
  • Loading branch information
skirui-source authored Feb 17, 2022
1 parent fe37c0e commit f5ec4b2
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 10 deletions.
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,14 @@ def _fill(
return self

def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase:
    """Return this column shifted by ``offset`` using ``fill_value``.

    Parameters
    ----------
    offset : int
        Shift amount, forwarded unchanged to ``libcudf.copying.shift``.
    fill_value : ScalarLike
        Fill value forwarded to ``libcudf.copying.shift``. When the
        offset magnitude exceeds the column length, ``None`` selects
        ``column_empty_like(self, masked=True)`` and any other value
        selects ``full(len(self), fill_value, dtype=self.dtype)``.

    Returns
    -------
    ColumnBase
    """
    # In-range offsets go straight to libcudf.
    if abs(offset) <= len(self):
        return libcudf.copying.shift(self, offset, fill_value)

    # libcudf currently doesn't handle the case when offset > len(df);
    # ticket tracking the bug:
    # https://github.com/rapidsai/cudf/issues/10314
    # so the result is built here without calling into libcudf.
    if fill_value is None:
        return column_empty_like(self, masked=True)
    return full(len(self), fill_value, dtype=self.dtype)

@property
Expand Down
40 changes: 40 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6196,6 +6196,46 @@ def explode(self, column, ignore_index=False):

return super()._explode(column, ignore_index)

def pct_change(
    self, periods=1, fill_method="ffill", limit=None, freq=None
):
    """
    Calculate the percent change between sequential elements
    in the DataFrame.

    NAs are first filled with ``fill_method``; the result is then
    ``diff(periods)`` of the filled frame divided by ``shift(periods)``
    of the filled frame.

    Parameters
    ----------
    periods : int, default 1
        Periods to shift for forming percent change.
    fill_method : str, default 'ffill'
        How to handle NAs before computing percent changes. Must be
        one of 'ffill', 'pad', 'bfill' or 'backfill'.
    limit : int, optional
        The number of consecutive NAs to fill before stopping.
        Not yet implemented.
    freq : str, optional
        Increment to use from time series API.
        Not yet implemented.

    Returns
    -------
    DataFrame

    Raises
    ------
    NotImplementedError
        If ``limit`` or ``freq`` is not None.
    ValueError
        If ``fill_method`` is not one of the supported names.
    """
    # Reject the unsupported options before touching any data.
    if limit is not None:
        raise NotImplementedError("limit parameter not supported yet.")
    if freq is not None:
        raise NotImplementedError("freq parameter not supported yet.")

    supported_fills = {"ffill", "pad", "bfill", "backfill"}
    if fill_method not in supported_fills:
        raise ValueError(
            "fill_method must be one of 'ffill', 'pad', "
            "'bfill', or 'backfill'."
        )

    # limit and freq are always None at this point; they are still
    # forwarded so the calls stay unchanged once support is added.
    filled = self.fillna(method=fill_method, limit=limit)
    delta = filled.diff(periods=periods)
    baseline = filled.shift(periods=periods, freq=freq)
    return delta / baseline

def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
):
Expand Down
49 changes: 39 additions & 10 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3442,29 +3442,37 @@ def test_get_numeric_data():


# NOTE(review): this span is a diff rendering whose +/- markers were lost.
# Lines that look duplicated (two "period" parametrizations, the 100000- and
# 10-element gen_rand calls, gdf/pdf next to gs/ps, both assert_eq forms)
# appear to be the removed and added versions shown back to back -- confirm
# against the repository before treating this as runnable code.
@pytest.mark.parametrize("dtype", NUMERIC_TYPES)
@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20])
@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15])
@pytest.mark.parametrize("data_empty", [False, True])
def test_shift(dtype, period, data_empty):

# TODO : this function currently tests for series.shift()
# but should instead test for dataframe.shift()
if data_empty:
data = None
else:
if dtype == np.int8:
# to keep data in range
data = gen_rand(dtype, 100000, low=-2, high=2)
data = gen_rand(dtype, 10, low=-2, high=2)
else:
data = gen_rand(dtype, 100000)
data = gen_rand(dtype, 10)

gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)})
pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)})
gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)})
ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)})

shifted_outcome = gdf.a.shift(period).fillna(0)
expected_outcome = pdf.a.shift(period).fillna(0).astype(dtype)
shifted_outcome = gs.a.shift(period)
expected_outcome = ps.a.shift(period)

# pandas uses NaNs to signal missing value and force converts the
# results columns to float types
if data_empty:
assert_eq(shifted_outcome, expected_outcome, check_index_type=False)
assert_eq(
shifted_outcome,
expected_outcome,
check_index_type=False,
check_dtype=False,
)
else:
assert_eq(shifted_outcome, expected_outcome)
assert_eq(shifted_outcome, expected_outcome, check_dtype=False)


@pytest.mark.parametrize("dtype", NUMERIC_TYPES)
Expand Down Expand Up @@ -9295,3 +9303,24 @@ def test_dataframe_rename_duplicate_column():
ValueError, match="Duplicate column names are not allowed"
):
gdf.rename(columns={"a": "b"}, inplace=True)


@pytest.mark.parametrize(
    "data",
    [
        np.random.RandomState(seed=10).randint(-50, 50, (10, 10)),
        np.random.RandomState(seed=10).random_sample((4, 4)),
        np.array([1.123, 2.343, 5.890, 0.0]),
        {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]},
    ],
)
@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5])
@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"])
def test_dataframe_pct_change(data, periods, fill_method):
    """Check that cudf ``DataFrame.pct_change`` agrees with pandas.

    The pandas frame is derived from the cudf frame via ``to_pandas`` so
    both libraries start from the same data.
    """
    options = {"periods": periods, "fill_method": fill_method}

    cudf_frame = cudf.DataFrame(data)
    pandas_frame = cudf_frame.to_pandas()

    got = cudf_frame.pct_change(**options)
    want = pandas_frame.pct_change(**options)

    assert_eq(want, got)

0 comments on commit f5ec4b2

Please sign in to comment.