Implement groupby.head and groupby.tail
- Closes rapidsai#2592
- Closes rapidsai#12245
wence- committed Mar 14, 2023
1 parent 3584739 commit caa2970
Showing 2 changed files with 148 additions and 4 deletions.
92 changes: 92 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -608,6 +608,98 @@ def _scan(self, op: str, *args, **kwargs):

aggregate = agg

def _head_tail(self, n, *, take_head=True):
    """Return the head or tail of each group

    Parameters
    ----------
    n
        Number of entries to include (if negative, number of
        entries to exclude)
    take_head
        Do we want the head or the tail of the group

    Returns
    -------
    New DataFrame or Series

    Notes
    -----
    Unlike pandas, this returns an object in group order, not
    original order
    """
    # A more memory-efficient implementation would merge the take
    # into the grouping, but that probably requires a new
    # aggregation scheme in libcudf. This is probably "fast
    # enough" for most reasonable input sizes.
    _, offsets, _, group_values = self._grouped()
    group_offsets = np.asarray(offsets, dtype=np.int32)
    size_per_group = np.diff(group_offsets)
    # "Out of bounds" n for the group size either means no entries
    # (negative) or all the entries (positive)
    if n < 0:
        size_per_group = np.maximum(
            size_per_group + n, 0, out=size_per_group
        )
    else:
        size_per_group = np.minimum(size_per_group, n, out=size_per_group)
    if take_head:
        group_offsets = group_offsets[:-1]
    else:
        group_offsets = group_offsets[1:] - size_per_group
    to_take = np.arange(size_per_group.sum(), dtype=np.int32)
    fixup = np.empty_like(size_per_group)
    fixup[0] = 0
    np.cumsum(size_per_group[:-1], out=fixup[1:])
    to_take += np.repeat(group_offsets - fixup, size_per_group)
    return group_values.iloc[to_take]

def head(self, n: int = 5):
    """Return first n rows of each group

    Parameters
    ----------
    n
        If positive: number of entries to include from start of group
        If negative: number of entries to exclude from end of group

    Returns
    -------
    Series or DataFrame
        Subset of the original grouped object as determined by n

    .. pandas-compat::
        Note that the returned object will be ordered group-wise
        (with the same ordering as ``.apply(lambda x: x.head(n))``),
        though the original index is preserved, rather than in the
        original input order.
    """
    return self._head_tail(n, take_head=True)

def tail(self, n: int = 5):
    """Return last n rows of each group

    Parameters
    ----------
    n
        If positive: number of entries to include from end of group
        If negative: number of entries to exclude from start of group

    Returns
    -------
    Series or DataFrame
        Subset of the original grouped object as determined by n

    .. pandas-compat::
        Note that the returned object will be ordered group-wise
        (with the same ordering as ``.apply(lambda x: x.tail(n))``),
        though the original index is preserved, rather than in the
        original input order.
    """
    return self._head_tail(n, take_head=False)

def nth(self, n):
    """
    Return the nth row from each group.
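The index arithmetic in `_head_tail` can be traced on a small example. The following standalone sketch is not part of the diff; the offsets are made up for three groups of sizes 1, 3 and 4, and it mirrors the numpy calls above for the `head(2)` case:

import numpy as np

# Hypothetical grouping: offsets delimit three groups of sizes 1, 3 and 4,
# i.e. rows 0, 1-3 and 4-7 of the values laid out in group order.
group_offsets = np.asarray([0, 1, 4, 8], dtype=np.int32)
size_per_group = np.diff(group_offsets)                      # [1, 3, 4]

n = 2  # head(2): clamp each group's size at n
size_per_group = np.minimum(size_per_group, n)               # [1, 2, 2]

# For head we take rows starting at each group's offset.
group_offsets = group_offsets[:-1]                           # [0, 1, 4]

# Consecutive output positions 0..4, then shift each run of positions
# so it starts at its group's offset within the grouped values.
to_take = np.arange(size_per_group.sum(), dtype=np.int32)    # [0, 1, 2, 3, 4]
fixup = np.empty_like(size_per_group)
fixup[0] = 0
np.cumsum(size_per_group[:-1], out=fixup[1:])                # fixup = [0, 1, 3]
to_take += np.repeat(group_offsets - fixup, size_per_group)

print(to_take)  # [0 1 2 4 5]: row 0 of group 0, rows 1-2 of group 1, rows 4-5 of group 2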
60 changes: 56 additions & 4 deletions python/cudf/cudf/tests/test_groupby.py
@@ -2,6 +2,7 @@

import datetime
import itertools
import operator
import textwrap
from decimal import Decimal

@@ -1474,7 +1475,6 @@ def test_grouping(grouper):
@pytest.mark.parametrize("agg", [lambda x: x.count(), "count"])
@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]])
def test_groupby_count(agg, by):

    pdf = pd.DataFrame(
        {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]}
    )
@@ -1540,7 +1540,6 @@ def test_groupby_nth(n, by):
    reason="https://github.com/pandas-dev/pandas/issues/43209",
)
def test_raise_data_error():

    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
    gdf = cudf.from_pandas(pdf)

@@ -1551,7 +1550,6 @@ def test_raise_data_error():


def test_drop_unsupported_multi_agg():

    gdf = cudf.DataFrame(
        {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
    )
@@ -2567,7 +2565,6 @@ def foo(x):
    ],
)
def test_groupby_apply_series_args(func, args):

    got = make_frame(DataFrame, 100).groupby("x").y.apply(func, *args)
    expect = (
        make_frame(pd.DataFrame, 100)
@@ -2963,3 +2960,58 @@ def test_groupby_dtypes(groups):
    pdf = df.to_pandas()

    assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes)


class TestHeadTail:
    @pytest.fixture(params=[-3, -2, -1, 0, 1, 2, 3])
    def n(self, request):
        return request.param

    @pytest.fixture
    def df(self):
        return cudf.DataFrame(
            {
                "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3],
                "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            }
        )

    @pytest.fixture(params=[True, False])
    def take_head(self, request):
        return request.param

    @pytest.fixture
    def expected(self, df, n, take_head):
        if n == 0:
            # We'll get an empty dataframe in this case
            return df.iloc[:0]
        else:
            # We groupby "a" which is the first column
            keyfunc = operator.itemgetter(0)
            if take_head or n == 0:
                # Head does group[:n] as does tail for n == 0
                slicefunc = operator.itemgetter(slice(None, n))
            else:
                # Tail does group[-n:] except when n == 0
                slicefunc = operator.itemgetter(
                    slice(-n, None) if n else slice(0)
                )
            expect_a, expect_b = zip(
                *itertools.chain.from_iterable(
                    slicefunc(list(group))
                    for _, group in itertools.groupby(
                        sorted(df.values_host.tolist(), key=keyfunc),
                        key=keyfunc,
                    )
                )
            )
            return cudf.DataFrame(
                {"a": expect_a, "b": expect_b}, index=cudf.Index(expect_b)
            )

    def test_head_tail(self, df, n, take_head, expected):
        if take_head:
            actual = df.groupby("a").head(n=n)
        else:
            actual = df.groupby("a").tail(n=n)
        assert_eq(actual, expected)
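
For reference, a minimal usage sketch of the new API (not part of the diff; it restates the semantics documented in the docstrings above, with output omitted):

import cudf

df = cudf.DataFrame(
    {
        "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3],
        "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    }
)

# First two rows of each group. Rows come back in group order
# (as with .apply(lambda g: g.head(2))), with the original index preserved.
print(df.groupby("a").head(2))

# Negative n excludes entries instead: drop the last row of each group.
print(df.groupby("a").head(-1))

# Last two rows of each group.
print(df.groupby("a").tail(2))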
