Implement groupby.head and groupby.tail
- Closes rapidsai#2592
- Closes rapidsai#12245
wence- committed Mar 14, 2023
1 parent 3584739 commit caa2970
Showing 2 changed files with 148 additions and 4 deletions.
92 changes: 92 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -608,6 +608,98 @@ def _scan(self, op: str, *args, **kwargs):

aggregate = agg

def _head_tail(self, n, *, take_head=True):
    """Return the head or tail of each group

    Parameters
    ----------
    n
        Number of entries to include (if negative, number of
        entries to exclude)
    take_head
        Do we want the head or the tail of the group

    Returns
    -------
    New DataFrame or Series

    Notes
    -----
    Unlike pandas, this returns an object in group order, not
    original order
    """
    # A more memory-efficient implementation would merge the take
    # into the grouping, but that probably requires a new
    # aggregation scheme in libcudf. This is probably "fast
    # enough" for most reasonable input sizes.
    _, offsets, _, group_values = self._grouped()
    group_offsets = np.asarray(offsets, dtype=np.int32)
    size_per_group = np.diff(group_offsets)
    # "Out of bounds" n for the group size either means no entries
    # (negative) or all the entries (positive)
    if n < 0:
        size_per_group = np.maximum(
            size_per_group + n, 0, out=size_per_group
        )
    else:
        size_per_group = np.minimum(size_per_group, n, out=size_per_group)
    if take_head:
        group_offsets = group_offsets[:-1]
    else:
        group_offsets = group_offsets[1:] - size_per_group
    to_take = np.arange(size_per_group.sum(), dtype=np.int32)
    fixup = np.empty_like(size_per_group)
    fixup[0] = 0
    np.cumsum(size_per_group[:-1], out=fixup[1:])
    to_take += np.repeat(group_offsets - fixup, size_per_group)
    return group_values.iloc[to_take]

def head(self, n: int = 5):
    """Return first n rows of each group

    Parameters
    ----------
    n
        If positive: number of entries to include from start of group
        If negative: number of entries to exclude from end of group

    Returns
    -------
    Series or DataFrame
        Subset of the original grouped object as determined by n

    .. pandas-compat::
        Note that the returned object will be ordered group-wise
        (with the same ordering as ``.apply(lambda x: x.head(n))``),
        though the original index is preserved, rather than in the
        original input order.
    """
    return self._head_tail(n, take_head=True)

def tail(self, n: int = 5):
    """Return last n rows of each group

    Parameters
    ----------
    n
        If positive: number of entries to include from end of group
        If negative: number of entries to exclude from start of group

    Returns
    -------
    Series or DataFrame
        Subset of the original grouped object as determined by n

    .. pandas-compat::
        Note that the returned object will be ordered group-wise
        (with the same ordering as ``.apply(lambda x: x.tail(n))``),
        though the original index is preserved, rather than in the
        original input order.
    """
    return self._head_tail(n, take_head=False)

def nth(self, n):
    """
    Return the nth row from each group.
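The index arithmetic in `_head_tail` can be traced on a small example. The following standalone sketch is not part of the diff; the offsets are made up for three groups of sizes 1, 3 and 4, and it mirrors the numpy calls above for the `head(2)` case:

import numpy as np

# Hypothetical grouping: offsets delimit three groups of sizes 1, 3 and 4,
# i.e. rows 0, 1-3 and 4-7 of the values laid out in group order.
group_offsets = np.asarray([0, 1, 4, 8], dtype=np.int32)
size_per_group = np.diff(group_offsets)                      # [1, 3, 4]

n = 2  # head(2): clamp each group's size at n
size_per_group = np.minimum(size_per_group, n)               # [1, 2, 2]

# For head we take rows starting at each group's offset.
group_offsets = group_offsets[:-1]                           # [0, 1, 4]

# Consecutive output positions 0..4, then shift each run of positions
# so it starts at its group's offset within the grouped values.
to_take = np.arange(size_per_group.sum(), dtype=np.int32)    # [0, 1, 2, 3, 4]
fixup = np.empty_like(size_per_group)
fixup[0] = 0
np.cumsum(size_per_group[:-1], out=fixup[1:])                # fixup = [0, 1, 3]
to_take += np.repeat(group_offsets - fixup, size_per_group)

print(to_take)  # [0 1 2 4 5]: row 0 of group 0, rows 1-2 of group 1, rows 4-5 of group 2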
60 changes: 56 additions & 4 deletions python/cudf/cudf/tests/test_groupby.py
@@ -2,6 +2,7 @@

import datetime
import itertools
import operator
import textwrap
from decimal import Decimal

@@ -1474,7 +1475,6 @@ def test_grouping(grouper):
@pytest.mark.parametrize("agg", [lambda x: x.count(), "count"])
@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]])
def test_groupby_count(agg, by):

    pdf = pd.DataFrame(
        {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]}
    )
@@ -1540,7 +1540,6 @@ def test_groupby_nth(n, by):
    reason="https://github.com/pandas-dev/pandas/issues/43209",
)
def test_raise_data_error():

    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
    gdf = cudf.from_pandas(pdf)

@@ -1551,7 +1550,6 @@ def test_raise_data_error():


def test_drop_unsupported_multi_agg():

    gdf = cudf.DataFrame(
        {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
    )
@@ -2567,7 +2565,6 @@ def foo(x):
    ],
)
def test_groupby_apply_series_args(func, args):

    got = make_frame(DataFrame, 100).groupby("x").y.apply(func, *args)
    expect = (
        make_frame(pd.DataFrame, 100)
@@ -2963,3 +2960,58 @@ def test_groupby_dtypes(groups):
    pdf = df.to_pandas()

    assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes)


class TestHeadTail:
    @pytest.fixture(params=[-3, -2, -1, 0, 1, 2, 3])
    def n(self, request):
        return request.param

    @pytest.fixture
    def df(self):
        return cudf.DataFrame(
            {
                "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3],
                "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            }
        )

    @pytest.fixture(params=[True, False])
    def take_head(self, request):
        return request.param

    @pytest.fixture
    def expected(self, df, n, take_head):
        if n == 0:
            # We'll get an empty dataframe in this case
            return df.iloc[:0]
        else:
            # We groupby "a" which is the first column
            keyfunc = operator.itemgetter(0)
            if take_head or n == 0:
                # Head does group[:n] as does tail for n == 0
                slicefunc = operator.itemgetter(slice(None, n))
            else:
                # Tail does group[-n:] except when n == 0
                slicefunc = operator.itemgetter(
                    slice(-n, None) if n else slice(0)
                )
            expect_a, expect_b = zip(
                *itertools.chain.from_iterable(
                    slicefunc(list(group))
                    for _, group in itertools.groupby(
                        sorted(df.values_host.tolist(), key=keyfunc),
                        key=keyfunc,
                    )
                )
            )
            return cudf.DataFrame(
                {"a": expect_a, "b": expect_b}, index=cudf.Index(expect_b)
            )

    def test_head_tail(self, df, n, take_head, expected):
        if take_head:
            actual = df.groupby("a").head(n=n)
        else:
            actual = df.groupby("a").tail(n=n)
        assert_eq(actual, expected)
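
For reference, a minimal usage sketch of the new API (not part of the diff; it restates the semantics documented in the docstrings above, with output omitted):

import cudf

df = cudf.DataFrame(
    {
        "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3],
        "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    }
)

# First two rows of each group. Rows come back in group order
# (as with .apply(lambda g: g.head(2))), with the original index preserved.
print(df.groupby("a").head(2))

# Negative n excludes entries instead: drop the last row of each group.
print(df.groupby("a").head(-1))

# Last two rows of each group.
print(df.groupby("a").tail(2))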
