From 6a9785827d2e144e4ba38913f0f70bc587c66576 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 8 Sep 2022 18:31:38 -0400 Subject: [PATCH 01/25] Fix regex negated classes to not automatically include new-lines (#11644) Fixes the regex negated class NCCLASS builder to not automatically include the new-line character in the negated range. This logic is from the original plan9 source but does not appear to be honored by any other known regex implementation. Marking this as a breaking change since it changes the existing behavior in case someone is depending on the current logic. Also added a new gtest to include `\n` data when matching with a negated class. Closes #11643 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Elias Stehle (https://github.com/elstehle) URL: https://github.com/rapidsai/cudf/pull/11644 --- cpp/src/strings/regex/regcomp.cpp | 3 --- cpp/tests/strings/contains_tests.cpp | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index c84b1e630c9..a8f8b619a68 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -298,9 +298,6 @@ class regex_parser { if (!is_quoted && chr == '^') { type = NCCLASS; std::tie(is_quoted, chr) = next_char(); - // negated classes also don't match '\n' - literals.push_back('\n'); - literals.push_back('\n'); } // parse class into a set of spans diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 960ccaec274..9ca4fbb6cb7 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -460,6 +460,23 @@ TEST_F(StringsContainsTests, OverlappedClasses) } } +TEST_F(StringsContainsTests, NegatedClasses) +{ + auto input = cudf::test::strings_column_wrapper({"abcdefg", "def\tghí", "", "éeé\néeé", "ABC"}); + 
auto sv = cudf::strings_column_view(input); + + { + auto results = cudf::strings::count_re(sv, "[^a-f]"); + cudf::test::fixed_width_column_wrapper expected({1, 4, 0, 5, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto results = cudf::strings::count_re(sv, "[^a-eá-é]"); + cudf::test::fixed_width_column_wrapper expected({2, 5, 0, 1, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsContainsTests, IncompleteClassesRange) { auto input = cudf::test::strings_column_wrapper({"abc-def", "---", "", "ghijkl", "-wxyz-"}); From d6d8d92dc03e302fe6ece24632ed35a453071ffe Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 8 Sep 2022 23:15:15 -0400 Subject: [PATCH 02/25] Fix invalid regex quantifier check to not include alternation (#11654) Fixes the check for invalid quantifier pattern to not include the alternation character. This moves the alternation instruction resolution before the quantifier validation checks. Added a gtest for this pattern as well. Closes #11650 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11654 --- cpp/src/strings/regex/regcomp.cpp | 4 +++- cpp/tests/strings/replace_regex_tests.cpp | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index a8f8b619a68..9b1013bae09 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -556,6 +556,9 @@ class regex_parser { // are treated as regex expressions and sometimes they are not. 
if (_items.empty()) { CUDF_FAIL("invalid regex pattern: nothing to repeat at position 0"); } + // handle alternation instruction + if (chr == '|') return OR; + // Check that the previous item can be used with quantifiers. // If the previous item is a capture group, we need to check items inside the // capture group can be used with quantifiers too. @@ -676,7 +679,6 @@ class regex_parser { // otherwise, fixed counted quantifier return COUNTED; } - case '|': return OR; } _chr = chr; return CHAR; diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index eb15745e473..79d968b14ad 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -162,6 +162,20 @@ TEST_F(StringsReplaceRegexTest, WordBoundary) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsReplaceRegexTest, Alternation) +{ + cudf::test::strings_column_wrapper input( + {"16 6 brr 232323 1 hello 90", "123 ABC 00 2022", "abé123 4567 89xyz"}); + auto results = cudf::strings::replace_re( + cudf::strings_column_view(input), "(^|\\s)\\d+(\\s|$)", cudf::string_scalar("_")); + auto expected = + cudf::test::strings_column_wrapper({"__ brr __ hello _", "_ABC_2022", "abé123 _ 89xyz"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = cudf::strings::replace_re( + cudf::strings_column_view(input), "(\\s|^)\\d+($|\\s)", cudf::string_scalar("_")); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) { cudf::test::strings_column_wrapper input({"DD", "zéz", "DsDs", ""}); From c8f57dd4e2342a2694597be5184b6dfe20d6ff76 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 9 Sep 2022 09:00:42 +0200 Subject: [PATCH 03/25] Add `gdb` pretty-printers for simple types (#11499) This adds `gdb` pretty printers for `rmm::device_uvector`, `thrust::*_vector`, `thrust::device_reference` and `cudf::*_span`. 
The implementation is based on https://github.com/NVIDIA/thrust/pull/1631. I will probably backport the thrust-specific changes to there as well, but since the location of the thrust source is not fixed, I'd prefer having all types in a self-contained file. Authors: - Tobias Ribizel (https://github.com/upsj) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/11499 --- cpp/CMakeLists.txt | 5 ++ cpp/scripts/gdb-pretty-printers.py | 84 +++++++++++++++++++++++++++++ cpp/scripts/load-pretty-printers.in | 3 ++ 3 files changed, 92 insertions(+) create mode 100644 cpp/scripts/gdb-pretty-printers.py create mode 100644 cpp/scripts/load-pretty-printers.in diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 09a60836fef..69394a34624 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -749,6 +749,11 @@ if(CUDF_BUILD_BENCHMARKS) add_subdirectory(benchmarks) endif() +# build pretty-printer load script +if(Thrust_SOURCE_DIR AND rmm_SOURCE_DIR) + configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY) +endif() + # ################################################################################################## # * install targets ------------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) diff --git a/cpp/scripts/gdb-pretty-printers.py b/cpp/scripts/gdb-pretty-printers.py new file mode 100644 index 00000000000..ebb56a8c9e6 --- /dev/null +++ b/cpp/scripts/gdb-pretty-printers.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import gdb + +global_locals = locals() +if not all( + name in global_locals + for name in ( + "HostIterator", + "DeviceIterator", + "is_template_type_not_alias", + "template_match", + ) +): + raise NameError( + "This file expects the RMM pretty-printers to be loaded already. " + "Either load them manually, or use the generated load-pretty-printers " + "script in the build directory" + ) + + +class CudfHostSpanPrinter(gdb.printing.PrettyPrinter): + """Print a cudf::host_span""" + + def __init__(self, val): + self.val = val + self.pointer = val["_data"] + self.size = int(val["_size"]) + + def children(self): + return HostIterator(self.pointer, self.size) + + def to_string(self): + return f"{self.val.type} of length {self.size} at {hex(self.pointer)}" + + def display_hint(self): + return "array" + + +class CudfDeviceSpanPrinter(gdb.printing.PrettyPrinter): + """Print a cudf::device_span""" + + def __init__(self, val): + self.val = val + self.pointer = val["_data"] + self.size = int(val["_size"]) + + def children(self): + return DeviceIterator(self.pointer, self.size) + + def to_string(self): + return f"{self.val.type} of length {self.size} at {hex(self.pointer)}" + + def display_hint(self): + return "array" + + +def lookup_cudf_type(val): + if not str(val.type.unqualified()).startswith("cudf::"): + return None + suffix = str(val.type.unqualified())[6:] + if not is_template_type_not_alias(suffix): + return None + if template_match(suffix, "host_span"): + return CudfHostSpanPrinter(val) + if template_match(suffix, "device_span"): + return 
CudfDeviceSpanPrinter(val) + return None + + +gdb.pretty_printers.append(lookup_cudf_type) diff --git a/cpp/scripts/load-pretty-printers.in b/cpp/scripts/load-pretty-printers.in new file mode 100644 index 00000000000..4c00384c878 --- /dev/null +++ b/cpp/scripts/load-pretty-printers.in @@ -0,0 +1,3 @@ +source @Thrust_SOURCE_DIR@/scripts/gdb-pretty-printers.py +source @rmm_SOURCE_DIR@/scripts/gdb-pretty-printers.py +source @PROJECT_SOURCE_DIR@/scripts/gdb-pretty-printers.py From 6cace8e769756474e11257e3de478ab1c071dd51 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 9 Sep 2022 08:48:01 -0500 Subject: [PATCH 04/25] Fix multi-file remote datasource bug (#11655) The intention of this PR is to address the dataset-truncation bug described in #11324. It seems that cudf does not currently handle multiple pyarrow-based datasources correctly. This means `cudf.read_parquet` will return incorrect results when reading a list of files from remote storage (e.g. from s3 or gcs). **The underlying problem**: Instead of passing a vector of `datasource` objects to libcudf, the python/cython API only passes along a `datasource` object for the very **first** file in the user-specified list. 
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/11655 --- python/cudf/cudf/_lib/cpp/io/types.pxd | 1 + python/cudf/cudf/_lib/io/utils.pyx | 8 ++++--- python/cudf/cudf/tests/test_s3.py | 32 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 8c8d2a2d7e4..21809ef7bd9 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -105,6 +105,7 @@ cdef extern from "cudf/io/types.hpp" \ source_info(const vector[string] &filepaths) except + source_info(const vector[host_buffer] &host_buffers) except + source_info(datasource *source) except + + source_info(const vector[datasource*] &datasources) except + cdef cppclass sink_info: io_type type diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 8e345bf969b..18b26bb5aa6 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory @@ -40,6 +40,7 @@ cdef source_info make_source_info(list src) except*: cdef vector[host_buffer] c_host_buffers cdef vector[string] c_files cdef Datasource csrc + cdef vector[datasource*] c_datasources empty_buffer = False if isinstance(src[0], bytes): empty_buffer = True @@ -58,8 +59,9 @@ cdef source_info make_source_info(list src) except*: # TODO (ptaylor): Might need to update this check if accepted input types # change when UCX and/or cuStreamz support is added. 
elif isinstance(src[0], Datasource): - csrc = src[0] - return source_info(csrc.get_datasource()) + for csrc in src: + c_datasources.push_back(csrc.get_datasource()) + return source_info(c_datasources) elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)): # If source is a file, return source_info where type=FILEPATH if not all(os.path.isfile(file) for file in src): diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index b754429555d..1fdd2dae31d 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -311,6 +311,38 @@ def test_read_parquet_ext( assert_eq(expect, got1) +def test_read_parquet_multi_file(s3_base, s3so, pdf): + fname_1 = "test_parquet_reader_multi_file_1.parquet" + buffer_1 = BytesIO() + pdf.to_parquet(path=buffer_1) + buffer_1.seek(0) + + fname_2 = "test_parquet_reader_multi_file_2.parquet" + buffer_2 = BytesIO() + pdf.to_parquet(path=buffer_2) + buffer_2.seek(0) + + bucket = "parquet" + with s3_context( + s3_base=s3_base, + bucket=bucket, + files={ + fname_1: buffer_1, + fname_2: buffer_2, + }, + ): + got = cudf.read_parquet( + [ + f"s3://{bucket}/{fname_1}", + f"s3://{bucket}/{fname_2}", + ], + storage_options=s3so, + ).reset_index(drop=True) + + expect = pd.concat([pdf, pdf], ignore_index=True) + assert_eq(expect, got) + + @pytest.mark.parametrize("columns", [None, ["Float", "String"]]) def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): # Write to buffer From f4856679c270c289d471719b6cf330ccfde7f453 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 9 Sep 2022 18:02:26 +0100 Subject: [PATCH 05/25] Update to mypy 0.971 (#11640) Update to current mypy, primarily since on Apple silicon, the previous pinned version of mypy is no longer installable (`typed-ast` does not build from source). This necessitates some minor updates to the type annotation rules, since the newer mypy version is a bit pickier. 
While we're here, exclude directories we were previously just ignoring errors in. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Jordan Jacobelli (https://github.com/Ethyling) - Vyas Ramasubramani (https://github.com/vyasr) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/11640 --- .pre-commit-config.yaml | 9 ++++- conda/environments/cudf_dev_cuda11.5.yml | 3 +- python/cudf/cudf/_typing.py | 8 ++++ python/cudf/cudf/core/column/string.py | 8 +++- python/cudf/cudf/core/dataframe.py | 5 +-- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 4 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 11 +++-- python/cudf/cudf/core/single_column_frame.py | 6 +-- setup.cfg | 42 ++++++-------------- 11 files changed, 53 insertions(+), 47 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f838ba3f45..ce6163755d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,10 +26,15 @@ repos: files: python/.*\.(py|pyx|pxd)$ types: [file] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.782' + rev: 'v0.971' hooks: - id: mypy - args: ["--config-file=setup.cfg", "python/cudf/cudf", "python/dask_cudf/dask_cudf", "python/custreamz/custreamz", "python/cudf_kafka/cudf_kafka"] + additional_dependencies: [types-cachetools] + args: ["--config-file=setup.cfg", + "python/cudf/cudf", + "python/custreamz/custreamz", + "python/cudf_kafka/cudf_kafka", + "python/dask_cudf/dask_cudf"] pass_filenames: false - repo: https://github.com/PyCQA/pydocstyle rev: 6.1.1 diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index d8281ca2b75..fb47af9939e 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -43,7 +43,8 @@ dependencies: - flake8=3.8.3 - black=22.3.0 - isort=5.10.1 
- - mypy=0.782 + - mypy=0.971 + - types-cachetools - doxygen=1.8.20 - pydocstyle=6.1.1 - typing_extensions diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 87988150fd3..e2ea12a0e4d 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,5 +1,6 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. +import sys from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, Union import numpy as np @@ -9,6 +10,13 @@ if TYPE_CHECKING: import cudf +# Backwards compat: mypy >= 0.790 rejects Type[NotImplemented], but +# NotImplementedType is only introduced in 3.10 +if sys.version_info >= (3, 10): + from types import NotImplementedType +else: + NotImplementedType = Any + # Many of these are from # https://github.com/pandas-dev/pandas/blob/master/pandas/_typing.py diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e64e5e50ae5..172a1ed9edc 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -34,6 +34,7 @@ ) from cudf.core.buffer import DeviceBufferLike from cudf.core.column import column, datetime +from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import can_convert_to_column @@ -3643,7 +3644,12 @@ def isempty(self) -> SeriesOrIndex: 4 False dtype: bool """ - return self._return_or_inplace((self._column == "").fillna(False)) + return self._return_or_inplace( + # mypy can't deduce that the return value of + # StringColumn.__eq__ is ColumnBase because the binops are + # dynamically added by a mixin class + cast(ColumnBase, self._column == "").fillna(False) + ) def isspace(self) -> SeriesOrIndex: r""" diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 452c6858f92..d0356386f75 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ 
-21,7 +21,6 @@ Optional, Set, Tuple, - Type, TypeVar, Union, ) @@ -40,7 +39,7 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf._typing import ColumnLike +from cudf._typing import ColumnLike, NotImplementedType from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -1934,7 +1933,7 @@ def _make_operands_and_index_for_binop( ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - Type[NotImplemented], + NotImplementedType, ], Optional[BaseIndex], ]: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0fdcabc0e8b..d1995615e0c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -231,7 +231,7 @@ def step(self): def _num_rows(self): return len(self) - @cached_property + @cached_property # type: ignore @_cudf_nvtx_annotate def _values(self): if len(self) > 0: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5cb21d0dc40..9bda475589a 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -30,7 +30,7 @@ import cudf import cudf._lib as libcudf -from cudf._typing import ColumnLike, DataFrameOrSeries +from cudf._typing import ColumnLike, DataFrameOrSeries, NotImplementedType from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, @@ -2991,7 +2991,7 @@ def _make_operands_and_index_for_binop( ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - Type[NotImplemented], + NotImplementedType, ], Optional[cudf.BaseIndex], ]: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 3fb4238a8b6..06a2cc33c1f 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1458,7 +1458,7 @@ def from_pandas(cls, multiindex, nan_as_null=None): ) return cls.from_frame(df, names=multiindex.names) - @cached_property + @cached_property # type: ignore 
@_cudf_nvtx_annotate def is_unique(self): return len(self) == len(self.unique()) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b31dd70a6dc..4ab28cab5a0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -8,7 +8,7 @@ import textwrap from collections import abc from shutil import get_terminal_size -from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union +from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union import cupy import numpy as np @@ -19,7 +19,12 @@ import cudf from cudf import _lib as libcudf from cudf._lib.scalar import _is_null_host_scalar -from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike +from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + NotImplementedType, + ScalarLike, +) from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, @@ -1289,7 +1294,7 @@ def _make_operands_and_index_for_binop( ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - Type[NotImplemented], + NotImplementedType, ], Optional[BaseIndex], ]: diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 1dc7ae403c3..4d4d079cf41 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -4,14 +4,14 @@ from __future__ import annotations import warnings -from typing import Any, Dict, Optional, Tuple, Type, TypeVar, Union +from typing import Any, Dict, Optional, Tuple, TypeVar, Union import cupy import numpy as np import pandas as pd import cudf -from cudf._typing import Dtype, ScalarLike +from cudf._typing import Dtype, NotImplementedType, ScalarLike from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -302,7 +302,7 @@ def _make_operands_for_binop( **kwargs, ) -> Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - Type[NotImplemented], + 
NotImplementedType, ]: """Generate the dictionary of operands used for a binary operation. diff --git a/setup.cfg b/setup.cfg index b5be5cda653..182fb1e7805 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,33 +37,15 @@ select = [mypy] ignore_missing_imports = True - -[mypy-cudf._lib.*] -ignore_errors = True - -[mypy-cudf._version] -ignore_errors = True - -[mypy-cudf.utils.metadata.orc_column_statistics_pb2] -ignore_errors = True - -[mypy-cudf.tests.*] -ignore_errors = True - -[mypy-dask_cudf._version] -ignore_errors = True - -[mypy-dask_cudf.tests.*] -ignore_errors = True - -[mypy-custreamz._version] -ignore_errors = True - -[mypy-custreamz.tests.*] -ignore_errors = True - -[mypy-cudf_kafka._version] -ignore_errors = True - -[mypy-cudf_kafka.tests.*] -ignore_errors = True +# If we don't specify this, then mypy will check excluded files if +# they are imported by a checked file. +follow_imports = skip +exclude = (?x)( + (cudf|custreamz|cudf_kafka|dask_cudf)/_version\.py + | cudf/_lib/ + | cudf/cudf/benchmarks/ + | cudf/cudf/tests/ + | custreamz/custreamz/tests/ + | dask_cudf/dask_cudf/tests/ + # This close paren cannot be in column zero otherwise the config parser barfs + ) From 9f8db66b130c75403c42a02dc83ee2166153cab7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 9 Sep 2022 17:53:18 -0700 Subject: [PATCH 06/25] Maintain the index name after `.loc` (#11677) Closes #11664 Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/11677 --- python/cudf/cudf/core/dataframe.py | 6 ++++-- python/cudf/cudf/tests/test_indexing.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d0356386f75..c07a88e9396 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -304,7 +304,7 @@ def 
_getitem_tuple_arg(self, arg): start = arg[0].start if start is None: start = self._frame.index[0] - df.index = as_index(start) + df.index = as_index(start, name=self._frame.index.name) else: row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): @@ -312,7 +312,9 @@ def _getitem_tuple_arg(self, arg): row_selection ) else: - df.index = as_index(row_selection) + df.index = as_index( + row_selection, name=self._frame.index.name + ) # Step 4: Downcast if self._can_downcast_to_series(df, arg): return self._downcast_to_series(df, arg) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index dbeb1204c73..d726ba16e86 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -286,6 +286,7 @@ def test_dataframe_loc(scalar, step): "d": np.random.random(size).astype(np.float64), } ) + pdf.index.name = "index" df = cudf.DataFrame.from_pandas(pdf) @@ -629,6 +630,9 @@ def test_dataframe_iloc(nelem): pdf["a"] = ha pdf["b"] = hb + gdf.index.name = "index" + pdf.index.name = "index" + assert_eq(gdf.iloc[-1:1], pdf.iloc[-1:1]) assert_eq(gdf.iloc[nelem - 1 : -1], pdf.iloc[nelem - 1 : -1]) assert_eq(gdf.iloc[0 : nelem - 1], pdf.iloc[0 : nelem - 1]) From 9f9a55da78d5f6e95c0c4bd8323b307e3a10c650 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Mon, 12 Sep 2022 21:51:22 +0530 Subject: [PATCH 07/25] List lexicographic comparator (#11129) Contributes to #10186 This PR enables lexicographic comparisons between list columns. The comparison is robust to arbitrary levels of nesting, but does not yet support lists of (lists of...) structs. The comparison is based on the Dremel encoding already in use in the Parquet file format. To assist reviewers, here is a reasonably complete list of the changes in this PR: 1. A helper function to get per-column Dremel data (for list columns) when constructing a preprocessed table, which now owns the Dremel data. 2. 
Updating the set of lexicographically compatible columns to now include list columns as long as they do not have any nested structs within. 3. An implementation of `lexicographic::device_row_comparator::operator()` for lists. **This function is the heart of the change to enable comparisons between list columns.** 4. A new benchmark for sorting that uses list data. 5. An update to a preexisting rolling collect set test that previously failed (because it requires list comparison) but now works. 6. New tests for list comparison. Authors: - Devavret Makkar (https://github.com/devavret) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mark Harris (https://github.com/harrism) - AJ Schmidt (https://github.com/ajschmidt8) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/11129 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/sort/sort_lists.cpp | 49 +++ cpp/include/cudf/lists/detail/dremel.hpp | 14 +- .../cudf/table/experimental/row_operators.cuh | 293 +++++++++++++++--- cpp/include/cudf/table/table_view.hpp | 8 + .../binaryop/compiled/struct_binary_ops.cuh | 14 +- cpp/src/search/search_ordered.cu | 55 +++- cpp/src/sort/sort_impl.cuh | 36 ++- cpp/src/table/row_operators.cu | 54 +++- cpp/src/table/table_view.cpp | 6 + cpp/tests/rolling/collect_ops_test.cpp | 25 +- cpp/tests/sort/sort_test.cpp | 105 ++++++- .../table/experimental_row_operator_tests.cu | 48 ++- 13 files changed, 591 insertions(+), 118 deletions(-) create mode 100644 cpp/benchmarks/sort/sort_lists.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 6b3ecbfac1c..fb46d1b583e 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -165,7 +165,7 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp) # ################################################################################################## # * sort benchmark 
-------------------------------------------------------------------------------- ConfigureBench(SORT_BENCH sort/rank.cpp sort/sort.cpp sort/sort_strings.cpp) -ConfigureNVBench(SORT_NVBENCH sort/sort_structs.cpp) +ConfigureNVBench(SORT_NVBENCH sort/sort_lists.cpp sort/sort_structs.cpp) # ################################################################################################## # * quantiles benchmark diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp new file mode 100644 index 00000000000..dac865de479 --- /dev/null +++ b/cpp/benchmarks/sort/sort_lists.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include + +void nvbench_sort_lists(nvbench::state& state) +{ + cudf::rmm_pool_raii pool_raii; + + const size_t size_bytes(state.get_int64("size_bytes")); + const cudf::size_type depth{static_cast(state.get_int64("depth"))}; + auto const null_frequency{state.get_float64("null_frequency")}; + + data_profile table_profile; + table_profile.set_distribution_params(cudf::type_id::LIST, distribution_id::UNIFORM, 0, 5); + table_profile.set_list_depth(depth); + table_profile.set_null_probability(null_frequency); + auto const table = + create_random_table({cudf::type_id::LIST}, table_size_bytes{size_bytes}, table_profile); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + cudf::detail::sorted_order(*table, {}, {}, stream_view, rmm::mr::get_current_device_resource()); + }); +} + +NVBENCH_BENCH(nvbench_sort_lists) + .set_name("sort_list") + .add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28}) + .add_int64_axis("depth", {1, 4}) + .add_float64_axis("null_frequency", {0, 0.2}); diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index 4ddad4177be..4e3aeec2499 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -28,15 +28,11 @@ namespace cudf::detail { * @see the `dremel_data` struct for more info. */ struct dremel_device_view { - // TODO: These elements are default initializable to support default - // initialization of the object. This is currently exploited to create views - // that will never actually be used. We should consider whether this - // represents a serious issue that should be worked around more robustly. 
- size_type const* offsets{}; - uint8_t const* rep_levels{}; - uint8_t const* def_levels{}; - size_type const leaf_data_size{}; - uint8_t const max_def_level{}; + size_type const* offsets; + uint8_t const* rep_levels; + uint8_t const* def_levels; + size_type const leaf_data_size; + uint8_t const max_def_level; }; /** diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index aad882fcc9a..af7091fc00c 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -210,6 +211,25 @@ struct sorting_physical_element_comparator { } }; +using optional_dremel_view = thrust::optional; + +// The has_nested_columns template parameter of the device_row_comparator is +// necessary to help the compiler optimize our code. Without it, the list and +// struct view specializations are present in the code paths used for primitive +// types, and the compiler fails to inline this nearly as well resulting in a +// significant performance drop. As a result, there is some minor tension in +// the current design between the presence of this parameter and the way that +// the Dremel data is passed around, first as a +// std::optional> in the +// preprocessed_table/device_row_comparator (which is always valid when +// has_nested_columns and is otherwise invalid) that is then unpacked to a +// thrust::optional at the element_comparator level (which +// is always valid for a list column and otherwise invalid). We cannot use an +// additional template parameter for the element_comparator on a per-column +// basis because we cannot conditionally define dremel_device_view member +// variables without jumping through extra hoops with inheritance, so the +// thrust::optional member must be an optional rather than +// a raw dremel_device_view. 
/** * @brief Computes the lexicographic comparison between 2 rows. * @@ -230,7 +250,8 @@ struct sorting_physical_element_comparator { * rather than logical elements, defaults to `NaN` aware relational comparator that evaluates `NaN` * as greater than all other values. */ -template class device_row_comparator { friend class self_comparator; ///< Allow self_comparator to access private members @@ -256,12 +277,16 @@ class device_row_comparator { device_row_comparator(Nullate check_nulls, table_device_view lhs, table_device_view rhs, + device_span l_dremel_device_views, + device_span r_dremel_device_views, std::optional> depth = std::nullopt, std::optional> column_order = std::nullopt, std::optional> null_precedence = std::nullopt, PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, + _l_dremel(l_dremel_device_views), + _r_dremel(r_dremel_device_views), _check_nulls{check_nulls}, _depth{depth}, _column_order{column_order}, @@ -292,14 +317,18 @@ class device_row_comparator { __device__ element_comparator(Nullate check_nulls, column_device_view lhs, column_device_view rhs, - null_order null_precedence = null_order::BEFORE, - int depth = 0, - PhysicalElementComparator comparator = {}) + null_order null_precedence = null_order::BEFORE, + int depth = 0, + PhysicalElementComparator comparator = {}, + optional_dremel_view l_dremel_device_view = {}, + optional_dremel_view r_dremel_device_view = {}) : _lhs{lhs}, _rhs{rhs}, _check_nulls{check_nulls}, _null_precedence{null_precedence}, _depth{depth}, + _l_dremel_device_view{l_dremel_device_view}, + _r_dremel_device_view{r_dremel_device_view}, _comparator{comparator} { } @@ -333,14 +362,15 @@ class device_row_comparator { template () and - not std::is_same_v)> + (not has_nested_columns or not cudf::is_nested()))> __device__ cuda::std::pair operator()(size_type const, size_type const) const noexcept { CUDF_UNREACHABLE("Attempted to compare elements of uncomparable types."); } - template )> + template )> 
__device__ cuda::std::pair operator()( size_type const lhs_element_index, size_type const rhs_element_index) const noexcept { @@ -373,12 +403,103 @@ class device_row_comparator { rhs_element_index); } + template )> + __device__ cuda::std::pair operator()(size_type lhs_element_index, + size_type rhs_element_index) + { + // These are all the values from the Dremel encoding. + auto const l_max_def_level = _l_dremel_device_view->max_def_level; + auto const l_def_levels = _l_dremel_device_view->def_levels; + auto const r_def_levels = _r_dremel_device_view->def_levels; + auto const l_rep_levels = _l_dremel_device_view->rep_levels; + auto const r_rep_levels = _r_dremel_device_view->rep_levels; + + // Traverse the nested list hierarchy to get a column device view + // pointing to the underlying child data. + column_device_view lcol = _lhs.slice(lhs_element_index, 1); + column_device_view rcol = _rhs.slice(rhs_element_index, 1); + while (lcol.type().id() == type_id::LIST) { + lcol = detail::lists_column_device_view(lcol).get_sliced_child(); + rcol = detail::lists_column_device_view(rcol).get_sliced_child(); + } + + // These start and end values indicate the start and end points of all + // the elements of the lists in the current list element + // (`[lhs|rhs]_element_index`) that we are comparing. + auto const l_offsets = _l_dremel_device_view->offsets; + auto const r_offsets = _r_dremel_device_view->offsets; + auto l_start = l_offsets[lhs_element_index]; + auto l_end = l_offsets[lhs_element_index + 1]; + auto r_start = r_offsets[rhs_element_index]; + auto r_end = r_offsets[rhs_element_index + 1]; + + // This comparator will be used to compare leaf (non-nested) data types. + auto comparator = + element_comparator{_check_nulls, lcol, rcol, _null_precedence, _depth, _comparator}; + + // Loop over each element in the encoding. Note that this includes nulls + // and empty lists, so not every index corresponds to an actual element + // in the child column. 
The element_index is used to keep track of the current + // child element that we're actually comparing. + weak_ordering state{weak_ordering::EQUIVALENT}; + for (int l_dremel_index = l_start, r_dremel_index = r_start, element_index = 0; + l_dremel_index < l_end and r_dremel_index < r_end; + ++l_dremel_index, ++r_dremel_index) { + // First early exit: the definition levels do not match. + if (l_def_levels[l_dremel_index] != r_def_levels[r_dremel_index]) { + state = (l_def_levels[l_dremel_index] < r_def_levels[r_dremel_index]) + ? weak_ordering::LESS + : weak_ordering::GREATER; + return cuda::std::pair(state, _depth); + } + + // Second early exit: the repetition levels do not match. + if (l_rep_levels[l_dremel_index] != r_rep_levels[r_dremel_index]) { + state = (l_rep_levels[l_dremel_index] < r_rep_levels[r_dremel_index]) + ? weak_ordering::LESS + : weak_ordering::GREATER; + return cuda::std::pair(state, _depth); + } + + // Third early exit: This case has two branches. + // 1) If we are at the maximum definition level, then we actually have + // an underlying element to compare, not just an empty list or a + // null. Therefore, we access the element_index element of each list + // and compare the values. + // 2) If we are one level below the maximum definition level and the + // column is nullable, the current element must be a null in the + // leaf data. In this case we ignore the null and skip to the next + // element. 
+ if (l_def_levels[l_dremel_index] == l_max_def_level) { + int last_null_depth = _depth; + cuda::std::tie(state, last_null_depth) = cudf::type_dispatcher( + lcol.type(), comparator, element_index, element_index); + if (state != weak_ordering::EQUIVALENT) { return cuda::std::pair(state, _depth); } + ++element_index; + } else if (lcol.nullable() and l_def_levels[l_dremel_index] == l_max_def_level - 1) { + ++element_index; + } + } + + // If we have reached this stage, we know that definition levels, + // repetition levels, and actual elements are identical in both list + // columns up to the `min(l_end - l_start, r_end - r_start)` element of + // the Dremel encoding. However, two lists can only compare equivalent if + // they are of the same length. Otherwise, the shorter of the two is less + // than the longer. This final check determines the appropriate resulting + // ordering by checking how many total elements each list is composed of. + return cuda::std::pair(detail::compare_elements(l_end - l_start, r_end - r_start), _depth); + } + private: column_device_view const _lhs; column_device_view const _rhs; Nullate const _check_nulls; null_order const _null_precedence; int const _depth; + optional_dremel_view _l_dremel_device_view; + optional_dremel_view _r_dremel_device_view; PhysicalElementComparator const _comparator; }; @@ -396,6 +517,7 @@ class device_row_comparator { size_type const rhs_index) const noexcept { int last_null_depth = std::numeric_limits::max(); + size_type list_column_index{0}; for (size_type i = 0; i < _lhs.num_columns(); ++i) { int const depth = _depth.has_value() ? (*_depth)[i] : 0; if (depth > last_null_depth) { continue; } @@ -406,13 +528,30 @@ class device_row_comparator { null_order const null_precedence = _null_precedence.has_value() ? (*_null_precedence)[i] : null_order::BEFORE; + // TODO: At what point do we verify that the columns of lhs and rhs are + // all of the same types? 
I assume that it's already happened before + // here, otherwise the current code would be failing. + auto [l_dremel_i, r_dremel_i] = [&]() { + if (_lhs.column(i).type().id() == type_id::LIST) { + auto idx = list_column_index++; + return std::make_tuple(optional_dremel_view(_l_dremel[idx]), + optional_dremel_view(_r_dremel[idx])); + } else { + return std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + } + }(); + auto element_comp = element_comparator{_check_nulls, + _lhs.column(i), + _rhs.column(i), + null_precedence, + depth, + _comparator, + l_dremel_i, + r_dremel_i}; + weak_ordering state; - cuda::std::tie(state, last_null_depth) = cudf::type_dispatcher( - _lhs.column(i).type(), - element_comparator{ - _check_nulls, _lhs.column(i), _rhs.column(i), null_precedence, depth, _comparator}, - lhs_index, - rhs_index); + cuda::std::tie(state, last_null_depth) = + cudf::type_dispatcher(_lhs.column(i).type(), element_comp, lhs_index, rhs_index); if (state == weak_ordering::EQUIVALENT) { continue; } @@ -426,6 +565,8 @@ class device_row_comparator { private: table_device_view const _lhs; table_device_view const _rhs; + device_span const _l_dremel; + device_span const _r_dremel; Nullate const _check_nulls; std::optional> const _depth; std::optional> const _column_order; @@ -534,6 +675,41 @@ struct preprocessed_table { friend class self_comparator; ///< Allow self_comparator to access private members friend class two_table_comparator; ///< Allow two_table_comparator to access private members + /** + * @brief Construct a preprocessed table for use with lexicographical comparison + * + * Sets up the table for use with lexicographical comparison. The resulting preprocessed table can + * be passed to the constructor of `lexicographic::self_comparator` to avoid preprocessing again. 
+ * + * @param table The table to preprocess + * @param column_order Optional, device array the same length as a row that indicates the desired + * ascending/descending order of each column in a row. If empty, it is assumed all columns are + * sorted in ascending order. + * @param null_precedence Optional, device array the same length as a row and indicates how null + * values compare to all other for every column. If it is nullptr, then null precedence would be + * `null_order::BEFORE` for all columns. + * @param depths The depths of each column resulting from decomposing struct columns. + * @param dremel_data The dremel data for each list column. The length of this object is the + * number of list columns in the table. + * @param dremel_device_views Device views into the dremel_data structs contained in the + * `dremel_data` parameter. For columns that are not list columns, this uvector will should + * contain an empty `dremel_device_view`. As such, this uvector has as many elements as there are + * columns in the table (unlike the `dremel_data` parameter, which is only as long as the number + * of list columns). 
+ */ + preprocessed_table(table_device_view_owner&& table, + rmm::device_uvector&& column_order, + rmm::device_uvector&& null_precedence, + rmm::device_uvector&& depths, + std::vector&& dremel_data, + rmm::device_uvector&& dremel_device_views) + : _t(std::move(table)), + _column_order(std::move(column_order)), + _null_precedence(std::move(null_precedence)), + _depths(std::move(depths)), + _dremel_data(std::move(dremel_data)), + _dremel_device_views(std::move(dremel_device_views)){}; + preprocessed_table(table_device_view_owner&& table, rmm::device_uvector&& column_order, rmm::device_uvector&& null_precedence, @@ -541,7 +717,9 @@ struct preprocessed_table { : _t(std::move(table)), _column_order(std::move(column_order)), _null_precedence(std::move(null_precedence)), - _depths(std::move(depths)){}; + _depths(std::move(depths)), + _dremel_data{}, + _dremel_device_views{} {}; /** * @brief Implicit conversion operator to a `table_device_view` of the preprocessed table. @@ -590,11 +768,24 @@ struct preprocessed_table { return _depths.size() ? std::optional>(_depths) : std::nullopt; } + [[nodiscard]] device_span dremel_device_views() const + { + if (_dremel_device_views.has_value()) { + return device_span(*_dremel_device_views); + } else { + return {}; + } + } + private: table_device_view_owner const _t; rmm::device_uvector const _column_order; rmm::device_uvector const _null_precedence; rmm::device_uvector const _depths; + + // Dremel encoding of list columns used for the comparison algorithm + std::optional> _dremel_data; + std::optional> _dremel_device_views; }; /** @@ -660,22 +851,42 @@ class self_comparator { * @param comparator Physical element relational comparison functor. * @return A binary callable object. 
*/ - template auto less(Nullate nullate = {}, PhysicalElementComparator comparator = {}) const noexcept { - return less_comparator{device_row_comparator{ - nullate, *d_t, *d_t, d_t->depths(), d_t->column_order(), d_t->null_precedence(), comparator}}; + return less_comparator{ + device_row_comparator{ + nullate, + *d_t, + *d_t, + d_t->dremel_device_views(), + d_t->dremel_device_views(), + d_t->depths(), + d_t->column_order(), + d_t->null_precedence(), + comparator}}; } /// @copydoc less() - template auto less_equivalent(Nullate nullate = {}, PhysicalElementComparator comparator = {}) const noexcept { - return less_equivalent_comparator{device_row_comparator{ - nullate, *d_t, *d_t, d_t->depths(), d_t->column_order(), d_t->null_precedence(), comparator}}; + return less_equivalent_comparator{ + device_row_comparator{ + nullate, + *d_t, + *d_t, + d_t->dremel_device_views(), + d_t->dremel_device_views(), + d_t->depths(), + d_t->column_order(), + d_t->null_precedence(), + comparator}}; } private: @@ -792,34 +1003,42 @@ class two_table_comparator { * @param comparator Physical element relational comparison functor. * @return A binary callable object. 
*/ - template auto less(Nullate nullate = {}, PhysicalElementComparator comparator = {}) const noexcept { - return less_comparator{ - strong_index_comparator_adapter{device_row_comparator{nullate, - *d_left_table, - *d_right_table, - d_left_table->depths(), - d_left_table->column_order(), - d_left_table->null_precedence(), - comparator}}}; + return less_comparator{strong_index_comparator_adapter{ + device_row_comparator{ + nullate, + *d_left_table, + *d_right_table, + d_left_table->dremel_device_views(), + d_right_table->dremel_device_views(), + d_left_table->depths(), + d_left_table->column_order(), + d_left_table->null_precedence(), + comparator}}}; } /// @copydoc less() - template auto less_equivalent(Nullate nullate = {}, PhysicalElementComparator comparator = {}) const noexcept { - return less_equivalent_comparator{ - strong_index_comparator_adapter{device_row_comparator{nullate, - *d_left_table, - *d_right_table, - d_left_table->depths(), - d_left_table->column_order(), - d_left_table->null_precedence(), - comparator}}}; + return less_equivalent_comparator{strong_index_comparator_adapter{ + device_row_comparator{ + nullate, + *d_left_table, + *d_right_table, + d_left_table->dremel_device_views(), + d_right_table->dremel_device_views(), + d_left_table->depths(), + d_left_table->column_order(), + d_left_table->null_precedence(), + comparator}}}; } private: diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4d0aee292f6..8b520714b34 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -166,6 +166,14 @@ class table_view_base { */ table_view_base& operator=(table_view_base&&) = default; }; + +/** + * @brief Determine if any nested columns exist in a given table. 
+ * + * @param table The input table + * @return Whether nested columns exist in the input table + */ +bool has_nested_columns(table_view const& table); } // namespace detail /** diff --git a/cpp/src/binaryop/compiled/struct_binary_ops.cuh b/cpp/src/binaryop/compiled/struct_binary_ops.cuh index 804b931fa5b..def9ebcef97 100644 --- a/cpp/src/binaryop/compiled/struct_binary_ops.cuh +++ b/cpp/src/binaryop/compiled/struct_binary_ops.cuh @@ -93,9 +93,17 @@ void apply_struct_binary_op(mutable_column_view& out, out.end(), device_comparison_functor{optional_iter, is_lhs_scalar, is_rhs_scalar, device_comparator}); }; - is_any_v - ? tabulate_device_operator(table_comparator.less_equivalent(comparator_nulls, comparator)) - : tabulate_device_operator(table_comparator.less(comparator_nulls, comparator)); + if (cudf::detail::has_nested_columns(tlhs) || cudf::detail::has_nested_columns(trhs)) { + is_any_v + ? tabulate_device_operator( + table_comparator.less_equivalent(comparator_nulls, comparator)) + : tabulate_device_operator(table_comparator.less(comparator_nulls, comparator)); + } else { + is_any_v + ? 
tabulate_device_operator( + table_comparator.less_equivalent(comparator_nulls, comparator)) + : tabulate_device_operator(table_comparator.less(comparator_nulls, comparator)); + } } template search_ordered(table_view const& haystack, auto const comparator = cudf::experimental::row::lexicographic::two_table_comparator( matched_haystack, matched_needles, column_order, null_precedence, stream); - auto const has_nulls = has_nested_nulls(matched_haystack) or has_nested_nulls(matched_needles); - auto const d_comparator = comparator.less(nullate::DYNAMIC{has_nulls}); + auto const has_nulls = has_nested_nulls(matched_haystack) or has_nested_nulls(matched_needles); auto const haystack_it = cudf::experimental::row::lhs_iterator(0); auto const needles_it = cudf::experimental::row::rhs_iterator(0); - if (find_first) { - thrust::lower_bound(rmm::exec_policy(stream), - haystack_it, - haystack_it + haystack.num_rows(), - needles_it, - needles_it + needles.num_rows(), - out_it, - d_comparator); + if (cudf::detail::has_nested_columns(haystack) || cudf::detail::has_nested_columns(needles)) { + auto const d_comparator = comparator.less(nullate::DYNAMIC{has_nulls}); + if (find_first) { + thrust::lower_bound(rmm::exec_policy(stream), + haystack_it, + haystack_it + haystack.num_rows(), + needles_it, + needles_it + needles.num_rows(), + out_it, + d_comparator); + } else { + thrust::upper_bound(rmm::exec_policy(stream), + haystack_it, + haystack_it + haystack.num_rows(), + needles_it, + needles_it + needles.num_rows(), + out_it, + d_comparator); + } } else { - thrust::upper_bound(rmm::exec_policy(stream), - haystack_it, - haystack_it + haystack.num_rows(), - needles_it, - needles_it + needles.num_rows(), - out_it, - d_comparator); + auto const d_comparator = comparator.less(nullate::DYNAMIC{has_nulls}); + if (find_first) { + thrust::lower_bound(rmm::exec_policy(stream), + haystack_it, + haystack_it + haystack.num_rows(), + needles_it, + needles_it + needles.num_rows(), + out_it, + 
d_comparator); + } else { + thrust::upper_bound(rmm::exec_policy(stream), + haystack_it, + haystack_it + haystack.num_rows(), + needles_it, + needles_it + needles.num_rows(), + out_it, + d_comparator); + } } return result; } diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh index f98fda307b8..97fc8ac14cb 100644 --- a/cpp/src/sort/sort_impl.cuh +++ b/cpp/src/sort/sort_impl.cuh @@ -127,18 +127,32 @@ std::unique_ptr sorted_order(table_view input, auto comp = experimental::row::lexicographic::self_comparator(input, column_order, null_precedence, stream); - auto comparator = comp.less(nullate::DYNAMIC{has_nested_nulls(input)}); - - if (stable) { - thrust::stable_sort(rmm::exec_policy(stream), - mutable_indices_view.begin(), - mutable_indices_view.end(), - comparator); + if (cudf::detail::has_nested_columns(input)) { + auto comparator = comp.less(nullate::DYNAMIC{has_nested_nulls(input)}); + if (stable) { + thrust::stable_sort(rmm::exec_policy(stream), + mutable_indices_view.begin(), + mutable_indices_view.end(), + comparator); + } else { + thrust::sort(rmm::exec_policy(stream), + mutable_indices_view.begin(), + mutable_indices_view.end(), + comparator); + } } else { - thrust::sort(rmm::exec_policy(stream), - mutable_indices_view.begin(), - mutable_indices_view.end(), - comparator); + auto comparator = comp.less(nullate::DYNAMIC{has_nested_nulls(input)}); + if (stable) { + thrust::stable_sort(rmm::exec_policy(stream), + mutable_indices_view.begin(), + mutable_indices_view.end(), + comparator); + } else { + thrust::sort(rmm::exec_policy(stream), + mutable_indices_view.begin(), + mutable_indices_view.end(), + comparator); + } } // protection for temporary d_column_order and d_null_precedence stream.synchronize(); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index af88d6776a4..05e8860d63d 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -255,6 +255,24 @@ auto decompose_structs(table_view 
table, std::move(verticalized_col_depths)); } +/* + * This helper function generates dremel data for any list-type columns in a + * table. This data is necessary for lexicographic comparisons. + */ +auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream) +{ + std::vector dremel_data; + std::vector dremel_device_views; + for (auto const& col : table) { + if (col.type().id() == type_id::LIST) { + dremel_data.push_back(detail::get_dremel_data(col, {}, false, stream)); + dremel_device_views.push_back(dremel_data.back()); + } + } + auto d_dremel_device_views = detail::make_device_uvector_sync(dremel_device_views, stream); + return std::make_tuple(std::move(dremel_data), std::move(d_dremel_device_views)); +} + using column_checker_fn_t = std::function; /** @@ -264,18 +282,25 @@ using column_checker_fn_t = std::function; */ void check_lex_compatibility(table_view const& input) { - // Basically check if there's any LIST hiding anywhere in the table + // Basically check if there's any LIST of STRUCT or STRUCT of LIST hiding anywhere in the table column_checker_fn_t check_column = [&](column_view const& c) { - CUDF_EXPECTS(c.type().id() != type_id::LIST, - "Cannot lexicographic compare a table with a LIST column"); + if (c.type().id() == type_id::LIST) { + auto const& list_col = lists_column_view(c); + CUDF_EXPECTS(list_col.child().type().id() != type_id::STRUCT, + "Cannot lexicographic compare a table with a LIST of STRUCT column"); + check_column(list_col.child()); + } else if (c.type().id() == type_id::STRUCT) { + for (auto child = c.child_begin(); child < c.child_end(); ++child) { + CUDF_EXPECTS(child->type().id() != type_id::LIST, + "Cannot lexicographic compare a table with a STRUCT of LIST column"); + check_column(*child); + } + } if (not is_nested(c.type())) { CUDF_EXPECTS(is_relationally_comparable(c.type()), "Cannot lexicographic compare a table with a column of type " + jit::get_type_name(c.type())); } - for (auto child = c.child_begin(); child < 
c.child_end(); ++child) { - check_column(*child); - } }; for (column_view const& c : input) { check_column(c); @@ -336,8 +361,21 @@ std::shared_ptr preprocessed_table::create( auto d_null_precedence = detail::make_device_uvector_async(new_null_precedence, stream); auto d_depths = detail::make_device_uvector_async(verticalized_col_depths, stream); - return std::shared_ptr(new preprocessed_table( - std::move(d_t), std::move(d_column_order), std::move(d_null_precedence), std::move(d_depths))); + if (detail::has_nested_columns(t)) { + auto [dremel_data, d_dremel_device_view] = list_lex_preprocess(verticalized_lhs, stream); + return std::shared_ptr( + new preprocessed_table(std::move(d_t), + std::move(d_column_order), + std::move(d_null_precedence), + std::move(d_depths), + std::move(dremel_data), + std::move(d_dremel_device_view))); + } else { + return std::shared_ptr(new preprocessed_table(std::move(d_t), + std::move(d_column_order), + std::move(d_null_precedence), + std::move(d_depths))); + } } two_table_comparator::two_table_comparator(table_view const& left, diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index a413c8fe65b..0d1cabfd4f6 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -134,5 +134,11 @@ template bool is_relationally_comparable(table_view const& lhs, tabl template bool is_relationally_comparable(mutable_table_view const& lhs, mutable_table_view const& rhs); +bool has_nested_columns(table_view const& table) +{ + return std::any_of( + table.begin(), table.end(), [](column_view const& col) { return is_nested(col.type()); }); +} + } // namespace detail } // namespace cudf diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index a0af8f150e3..9dc13b2f9f7 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -2275,10 +2275,23 @@ TEST_F(CollectSetTest, ListTypeRollingWindow) auto const prev_column = 
fixed_width_column_wrapper{1, 2, 2, 2, 2}; auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; - EXPECT_THROW(rolling_collect_set(input_column, - prev_column, - foll_column, - 1, - *make_collect_set_aggregation()), - cudf::logic_error); + auto const expected = [] { + auto data = fixed_width_column_wrapper{1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 4, 5, + 6, 7, 8, 9, 6, 7, 8, 9, 10, 7, 8, 9, 10}; + auto inner_offsets = + fixed_width_column_wrapper{0, 3, 5, 8, 10, 11, 13, 14, 17, 18, 21, 22, 25, 26}; + auto outer_offsets = fixed_width_column_wrapper{0, 2, 5, 8, 11, 13}; + + auto inner_list = cudf::make_lists_column(13, inner_offsets.release(), data.release(), 0, {}); + + return cudf::make_lists_column(5, outer_offsets.release(), std::move(inner_list), 0, {}); + }(); + + auto const result = rolling_collect_set(input_column, + prev_column, + foll_column, + 1, + *make_collect_set_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected->view(), result->view()); } diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 1dd7e21b821..4092597d8e3 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -740,16 +741,100 @@ TYPED_TEST(Sort, ZeroSizedColumns) TYPED_TEST(Sort, WithListColumn) { - using T = int; - lists_column_wrapper lc{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; - CUDF_EXPECT_THROW_MESSAGE(cudf::sort(table_view({lc})), - "Cannot lexicographic compare a table with a LIST column"); - - std::vector> child_cols; - child_cols.push_back(lc.release()); - structs_column_wrapper sc{std::move(child_cols), {1, 0, 1}}; - CUDF_EXPECT_THROW_MESSAGE(cudf::sort(table_view({sc})), - "Cannot lexicographic compare a table with a LIST column"); + using T = TypeParam; + if (std::is_same_v) { GTEST_SKIP(); } + + using lcw = cudf::test::lists_column_wrapper; + lcw col{ + {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, + {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, + {{1, 2, 
3}, {}, {4, 5}, {0, 6, 0}}, + {{1, 2}, {3}, {4, 5}, {0, 6, 0}}, + {{7, 8}, {}}, + lcw{lcw{}, lcw{}, lcw{}}, + lcw{lcw{}}, + {lcw{10}}, + lcw{}, + }; + + auto expect = cudf::test::fixed_width_column_wrapper{8, 6, 5, 3, 0, 1, 2, 4, 7}; + auto result = cudf::sorted_order(cudf::table_view({col})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result); +} + +TYPED_TEST(Sort, WithNullableListColumn) +{ + using T = TypeParam; + if (std::is_same_v) { GTEST_SKIP(); } + + using lcw = cudf::test::lists_column_wrapper; + using cudf::test::iterators::nulls_at; + lcw col{ + {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, // 0 + {{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, nulls_at({3})}, // 1 + {{1, 2, 3}, {}, {4, 5}, {0, 6, 0}}, // 2 + {{1, 2}, {3}, {4, 5}, {0, 6, 0}}, // 3 + {{1, 2}, {3}, {4, 5}, {{0, 6, 0}, nulls_at({0})}}, // 4 + {{7, 8}, {}}, // 5 + lcw{lcw{}, lcw{}, lcw{}}, // 6 + lcw{lcw{}}, // 7 + {lcw{10}}, // 8 + lcw{}, // 9 + {{1, 2}, {3}, {4, 5}, {{0, 6, 0}, nulls_at({0, 2})}}, // 10 + {{1, 2}, {3}, {4, 5}, {{0, 7}, nulls_at({0})}}, // 11 + }; + + auto expect = + cudf::test::fixed_width_column_wrapper{9, 7, 6, 10, 4, 11, 3, 1, 0, 2, 5, 8}; + auto result = cudf::sorted_order(cudf::table_view({col})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result); +} + +TYPED_TEST(Sort, WithSlicedListColumn) +{ + using T = TypeParam; + if (std::is_same_v) { GTEST_SKIP(); } + + using lcw = cudf::test::lists_column_wrapper; + using cudf::test::iterators::nulls_at; + lcw col{ + {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, // + {{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, nulls_at({3})}, // 0 + {{1, 2, 3}, {}, {4, 5}, {0, 6, 0}}, // 1 + {{1, 2}, {3}, {4, 5}, {0, 6, 0}}, // 2 + {{1, 2}, {3}, {4, 5}, {{0, 6, 0}, nulls_at({0})}}, // 3 + {{7, 8}, {}}, // 4 + lcw{lcw{}, lcw{}, lcw{}}, // 5 + lcw{lcw{}}, // 6 + {lcw{10}}, // 7 + lcw{}, // 8 + {{1, 2}, {3}, {4, 5}, {{0, 6, 0}, nulls_at({0, 2})}}, // 9 + {{1, 2}, {3}, {4, 5}, {{0, 7}, nulls_at({0})}}, // + }; + + auto sliced_col = cudf::slice(col, {1, 10}); + + auto 
expect = cudf::test::fixed_width_column_wrapper{8, 6, 5, 3, 2, 0, 1, 4, 7}; + auto result = cudf::sorted_order(cudf::table_view({sliced_col})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result); +} + +TYPED_TEST(Sort, WithEmptyListColumn) +{ + using T = TypeParam; + if (std::is_same_v) { GTEST_SKIP(); } + + auto L1 = cudf::make_lists_column(0, + cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), + cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}), + 0, + {}); + auto L0 = cudf::make_lists_column( + 3, cudf::test::fixed_width_column_wrapper{0, 0, 0, 0}.release(), std::move(L1), 0, {}); + + auto expect = cudf::test::fixed_width_column_wrapper{0, 1, 2}; + auto result = cudf::sorted_order(cudf::table_view({*L0})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *result); } struct SortByKey : public BaseFixture { diff --git a/cpp/tests/table/experimental_row_operator_tests.cu b/cpp/tests/table/experimental_row_operator_tests.cu index db5a064b1c2..0566f55e46d 100644 --- a/cpp/tests/table/experimental_row_operator_tests.cu +++ b/cpp/tests/table/experimental_row_operator_tests.cu @@ -54,17 +54,25 @@ auto self_comparison(cudf::table_view input, rmm::cuda_stream_view stream{cudf::default_stream_value}; auto const table_comparator = lexicographic::self_comparator{input, column_order, {}, stream}; - auto const less_comparator = table_comparator.less(cudf::nullate::NO{}, comparator); auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), input.num_rows(), cudf::mask_state::UNALLOCATED); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_rows()), - thrust::make_counting_iterator(0), - output->mutable_view().data(), - less_comparator); + if (cudf::detail::has_nested_columns(input)) { + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + thrust::make_counting_iterator(0), + 
output->mutable_view().data(), + table_comparator.less(cudf::nullate::NO{}, comparator)); + } else { + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + thrust::make_counting_iterator(0), + output->mutable_view().data(), + table_comparator.less(cudf::nullate::NO{}, comparator)); + } return output; } @@ -78,19 +86,27 @@ auto two_table_comparison(cudf::table_view lhs, auto const table_comparator = lexicographic::two_table_comparator{lhs, rhs, column_order, {}, stream}; - auto const less_comparator = table_comparator.less(cudf::nullate::NO{}, comparator); - auto const lhs_it = cudf::experimental::row::lhs_iterator(0); - auto const rhs_it = cudf::experimental::row::rhs_iterator(0); + auto const lhs_it = cudf::experimental::row::lhs_iterator(0); + auto const rhs_it = cudf::experimental::row::rhs_iterator(0); auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED); - thrust::transform(rmm::exec_policy(stream), - lhs_it, - lhs_it + lhs.num_rows(), - rhs_it, - output->mutable_view().data(), - less_comparator); + if (cudf::detail::has_nested_columns(lhs) || cudf::detail::has_nested_columns(rhs)) { + thrust::transform(rmm::exec_policy(stream), + lhs_it, + lhs_it + lhs.num_rows(), + rhs_it, + output->mutable_view().data(), + table_comparator.less(cudf::nullate::NO{}, comparator)); + } else { + thrust::transform(rmm::exec_policy(stream), + lhs_it, + lhs_it + lhs.num_rows(), + rhs_it, + output->mutable_view().data(), + table_comparator.less(cudf::nullate::NO{}, comparator)); + } return output; } From 44d4e3181d9ac3e04b34bc87ef456cdfc9d6ec7e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 12 Sep 2022 11:59:53 -0500 Subject: [PATCH 08/25] Ignore protobuf generated files in `mypy` checks (#11685) As part of https://github.com/rapidsai/cudf/pull/11640, there was a missing exclusion for 
`cudf/cudf/utils/metadata/orc_column_statistics_pb2.py`, that causes the following errors: ```bash (cudfdev) pgali@dt07:/nvme/0/pgali/cudf$ git commit -m "address reviews" [WARNING] Unstaged files detected. [INFO] Stashing unstaged files to /home/nfs/pgali/.cache/pre-commit/patch1662993629-63423. isort....................................................................Passed black....................................................................Passed flake8...................................................................Passed mypy.....................................................................Failed - hook id: mypy - exit code: 1 python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:7: error: Library stubs not installed for "google.protobuf.internal" (or incompatible with Python 3.9) python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:7: note: Hint: "python3 -m pip install types-protobuf" python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:7: note: (or run "mypy --install-types" to install all missing stub packages) python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:7: note: See https://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:8: error: Library stubs not installed for "google.protobuf" (or incompatible with Python 3.9) python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:25: error: Name "_BUCKETSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:26: error: Name "_BUCKETSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:27: error: Name "_INTEGERSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:28: error: Name "_INTEGERSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:29: error: Name "_DOUBLESTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:30: 
error: Name "_DOUBLESTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:31: error: Name "_STRINGSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:32: error: Name "_STRINGSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:33: error: Name "_BUCKETSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:34: error: Name "_BUCKETSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:35: error: Name "_DECIMALSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:36: error: Name "_DECIMALSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:37: error: Name "_DATESTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:38: error: Name "_DATESTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:39: error: Name "_TIMESTAMPSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:40: error: Name "_TIMESTAMPSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:41: error: Name "_BINARYSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:42: error: Name "_BINARYSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:43: error: Name "_COLUMNSTATISTICS" is not defined python/cudf/cudf/utils/metadata/orc_column_statistics_pb2.py:44: error: Name "_COLUMNSTATISTICS" is not defined Found 22 errors in 1 file (checked 138 source files) pydocstyle...............................................................Passed clang-format.........................................(no files to check)Skipped no-deprecationwarning....................................................Passed cmake-format.........................................(no files to check)Skipped - hook 
id: cmake-format cmake-lint...........................................(no files to check)Skipped - hook id: cmake-lint copyright-check..........................................................Passed doxygen-check........................................(no files to check)Skipped - hook id: doxygen-check [INFO] Restored changes from /home/nfs/pgali/.cache/pre-commit/patch1662993629-63423. ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/11685 --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 182fb1e7805..d196e8605b2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,6 +45,7 @@ exclude = (?x)( | cudf/_lib/ | cudf/cudf/benchmarks/ | cudf/cudf/tests/ + | cudf/cudf/utils/metadata/orc_column_statistics_pb2.py | custreamz/custreamz/tests/ | dask_cudf/dask_cudf/tests/ # This close paren cannot be in column zero otherwise the config parser barfs From 866434f8f1424c84d99f823a2658032a88e9cee7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 12 Sep 2022 12:23:14 -0500 Subject: [PATCH 09/25] Fix issue with extracting nested column data & dtype preservation (#11671) This PR: Fixes: #11670 - [x] Fixes: https://github.com/rapidsai/cudf/issues/11670, by correctly generating the `column_metadata` for nested scenarios. - [x] Also fixes an issue with dtype mismatch after updating `children` in a `ListColumn`. See the pytest below. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/11671 --- python/cudf/cudf/_lib/interop.pyx | 43 +++++++++++++++++--------- python/cudf/cudf/_lib/scalar.pyx | 11 ++----- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/lists.py | 7 ++++- python/cudf/cudf/tests/test_list.py | 1 + python/cudf/cudf/tests/test_parquet.py | 23 ++++++++++++++ 6 files changed, 62 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index dece726270d..ee5ce165f95 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -87,56 +87,71 @@ cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj): dlpack_tensor.deleter(dlpack_tensor) -cdef vector[column_metadata] gather_metadata(dict cols_dtypes) except *: +cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *: """ Generates a column_metadata vector for each column. Parameters ---------- - cols_dtypes : dict - A dict mapping of column names & their dtypes. + cols_dtypes : iterable + An iterable of ``(column_name, dtype)`` pairs. 
""" cdef vector[column_metadata] cpp_metadata cpp_metadata.reserve(len(cols_dtypes)) if cols_dtypes is not None: - for idx, (col_name, col_dtype) in enumerate(cols_dtypes.items()): + for idx, (col_name, col_dtype) in enumerate(cols_dtypes): cpp_metadata.push_back(column_metadata(col_name.encode())) - if is_struct_dtype(col_dtype): + if is_struct_dtype(col_dtype) or is_list_dtype(col_dtype): _set_col_children_metadata(col_dtype, cpp_metadata[idx]) else: raise TypeError( - "A dictionary of column names and dtypes is required to " + "An iterable of (column_name, dtype) pairs is required to " "construct column_metadata" ) return cpp_metadata cdef _set_col_children_metadata(dtype, column_metadata& col_meta): + + cdef column_metadata element_metadata + if is_struct_dtype(dtype): - col_meta.children_meta.reserve(len(dtype.fields)) - for i, name in enumerate(dtype.fields): - value = dtype.fields[name] - col_meta.children_meta.push_back(column_metadata(name.encode())) + for name, value in dtype.fields.items(): + element_metadata = column_metadata(name.encode()) _set_col_children_metadata( - value, col_meta.children_meta[i] + value, element_metadata ) + col_meta.children_meta.push_back(element_metadata) + elif is_list_dtype(dtype): + col_meta.children_meta.reserve(2) + # Offsets - child 0 + col_meta.children_meta.push_back(column_metadata()) + + # Element column - child 1 + element_metadata = column_metadata() + _set_col_children_metadata( + dtype.element_type, element_metadata + ) + col_meta.children_meta.push_back(element_metadata) + else: + col_meta.children_meta.push_back(column_metadata()) -def to_arrow(list source_columns, dict cols_dtypes): +def to_arrow(list source_columns, object column_dtypes): """Convert a list of columns from cudf Frame to a PyArrow Table. Parameters ---------- source_columns : a list of columns to convert - cols_dtype : A dict mapping of column names & their dtypes. 
+ column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs Returns ------- pyarrow table """ - cdef vector[column_metadata] cpp_metadata = gather_metadata(cols_dtypes) + cdef vector[column_metadata] cpp_metadata = gather_metadata(column_dtypes) cdef table_view input_table_view = table_view_from_columns(source_columns) cdef shared_ptr[CTable] cpp_arrow_table diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index e73e994a73d..9b422b77eeb 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -396,7 +396,7 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): children=tuple(columns), size=1, ) - table = to_arrow([struct_col], {"None": dtype}) + table = to_arrow([struct_col], [("None", dtype)]) python_dict = table.to_pydict()["None"][0] return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict} @@ -428,14 +428,7 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype): cdef column_view list_col_view = (s.get()).view() cdef Column element_col = Column.from_column_view(list_col_view, None) - arrow_obj = to_arrow( - [element_col], - { - "None": dtype.element_type - if isinstance(element_col, cudf.core.column.StructColumn) - else dtype - } - )["None"] + arrow_obj = to_arrow([element_col], [("None", dtype.element_type)])["None"] result = arrow_obj.to_pylist() return _nested_na_replace(result) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1be0190c94f..4fe365768ef 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -233,7 +233,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], {"None": self.dtype})[ + return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ "None" ].chunk(0) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 0d5b351f69e..cf5831465a4 100644 --- 
a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,7 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. from functools import cached_property -from typing import List, Optional, Sequence, Union +from typing import List, Optional, Sequence, Tuple, Union import numpy as np import pyarrow as pa @@ -164,6 +164,11 @@ def set_base_data(self, value): else: super().set_base_data(value) + def set_base_children(self, value: Tuple[ColumnBase, ...]): + super().set_base_children(value) + _, values = value + self._dtype = cudf.ListDtype(element_type=values.dtype) + @property def __cuda_array_interface__(self): raise NotImplementedError( diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index a321d2b430a..aa4e5393e5b 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -842,6 +842,7 @@ def test_memory_usage(): 0, ), ([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], 2), + ([[[{"a": 1, "b": 2, "c": 10}]]], 0), ], ) def test_nested_list_extract_host_scalars(data, idx): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 0da7b6b14c9..84d89618909 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2545,3 +2545,26 @@ def test_parquet_columns_and_index_param(index, columns): got = cudf.read_parquet(buffer, columns=columns) assert_eq(expected, got, check_index_type=True) + + +def test_parquet_nested_struct_list(): + buffer = BytesIO() + data = { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + }, + "StreamId": "12345678", + "Duration": 10, + "Offset": 12, + "Resource": [{"Name": "ZoneName", "Value": "RAPIDS"}], + } + } + df = cudf.DataFrame({"a": cudf.Series(data)}) + + df.to_parquet(buffer) + expected = pd.read_parquet(buffer) + actual = cudf.read_parquet(buffer) + assert_eq(expected, actual) + assert_eq(actual.a.dtype, df.a.dtype) 
From dca285bd1d269ede566f1e71827ade39d1e72de6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 12 Sep 2022 10:53:59 -0700 Subject: [PATCH 10/25] Check conda recipe headers with pre-commit (#11669) This PR runs the conda recipe checks for all headers in `include/` with `pre-commit` instead of as a separate step in `style.sh`. This means that developers using `pre-commit` will be able to fix mistakes before pushing, where errors would cause failures in CI. Combined with #11668, most of our style check suite will be executed via `pre-commit`, enabling us to simplify `style.sh`. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/11669 --- .pre-commit-config.yaml | 12 ++++++++++++ ci/checks/headers_test.sh | 5 +---- ci/checks/style.sh | 7 +------ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ce6163755d7..a448b33a76d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -93,6 +93,18 @@ repos: language: system pass_filenames: false verbose: true + - id: headers-recipe-check + name: headers-recipe-check + entry: ./ci/checks/headers_test.sh + files: | + (?x)^( + ^cpp/include/| + ^conda/.*/meta.yaml + ) + types_or: [file] + language: system + pass_filenames: false + verbose: false default_language_version: python: python3 diff --git a/ci/checks/headers_test.sh b/ci/checks/headers_test.sh index ebfc4b2965e..502bdca0fa7 100755 --- a/ci/checks/headers_test.sh +++ b/ci/checks/headers_test.sh @@ -16,12 +16,9 @@ for DIRNAME in ${DIRNAMES[@]}; do LIB_RETVAL=$? 
if [ "$LIB_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check; begin output\n\n" + echo -e ">>>> FAILED: lib${LIBNAME} has different headers in include/${DIRNAME}/ and conda/recipes/lib${LIBNAME}/meta.yaml. The diff is shown below:" echo -e "$HEADER_DIFF" - echo -e "\n\n>>>> FAILED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check; end output\n\n" RETVAL=1 - else - echo -e "\n\n>>>> PASSED: lib${LIBNAME} header existence conda/recipes/lib${LIBNAME}/meta.yaml check\n\n" fi done diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 3c632e03219..27f34dc335e 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -49,13 +49,8 @@ else echo -e "\n\n>>>> PASSED: clang format check\n\n" fi -# Run header meta.yml check and get results/return code -HEADER_META=`ci/checks/headers_test.sh` -HEADER_META_RETVAL=$? -echo -e "$HEADER_META" - RETVALS=( - $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL + $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL ) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` From 39ad65fa205158a2a204b724f1a11e0e38f58637 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 12 Sep 2022 13:03:35 -0700 Subject: [PATCH 11/25] Remove redundant style check for clang-format. (#11668) This PR removes a redundant style check for clang-format. Our configuration in `.pre-commit-config.yaml` already runs clang-format so we don't need a separate step for that purpose in `style.sh`. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) - Jordan Jacobelli (https://github.com/Ethyling) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/11668 --- .pre-commit-config.yaml | 5 ++--- ci/checks/style.sh | 20 ++++---------------- cpp/.clang-format | 4 ++-- cpp/src/column/column_factories.cpp | 2 +- 4 files changed, 9 insertions(+), 22 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a448b33a76d..08e35fb47b0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -45,9 +45,8 @@ repos: rev: v11.1.0 hooks: - id: clang-format - files: \.(cu|cuh|h|hpp|cpp|inl)$ - types_or: [file] - args: ['-fallback-style=none', '-style=file', '-i'] + types_or: [c, c++, cuda] + args: ["-fallback-style=none", "-style=file", "-i"] - repo: local hooks: - id: no-deprecationwarning diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 27f34dc335e..de3f8c01d83 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -19,16 +19,16 @@ export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake. mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} - +# Run pre-commit checks pre-commit run --hook-stage manual --all-files PRE_COMMIT_RETVAL=$? # Check for copyright headers in the files modified currently COPYRIGHT=`python ci/checks/copyright.py --git-modified-only 2>&1` -CR_RETVAL=$? +COPYRIGHT_RETVAL=$? # Output results if failure otherwise show pass -if [ "$CR_RETVAL" != "0" ]; then +if [ "$COPYRIGHT_RETVAL" != "0" ]; then echo -e "\n\n>>>> FAILED: copyright check; begin output\n\n" echo -e "$COPYRIGHT" echo -e "\n\n>>>> FAILED: copyright check; end output\n\n" @@ -37,20 +37,8 @@ else echo -e "$COPYRIGHT" fi -# Run clang-format and check for a consistent code format -CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` -CLANG_FORMAT_RETVAL=$? 
- -if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n" - echo -e "$CLANG_FORMAT" - echo -e "\n\n>>>> FAILED: clang format check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: clang format check\n\n" -fi - RETVALS=( - $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL + $PRE_COMMIT_RETVAL $COPYRIGHT_RETVAL ) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/cpp/.clang-format b/cpp/.clang-format index 6019a6f3d5c..26b9a5bf4ce 100644 --- a/cpp/.clang-format +++ b/cpp/.clang-format @@ -15,7 +15,7 @@ AlignTrailingComments: true AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: true +AllowShortBlocksOnASingleLine: true AllowShortCaseLabelsOnASingleLine: true AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All @@ -27,7 +27,7 @@ AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: false +BinPackArguments: false BinPackParameters: false BraceWrapping: AfterClass: false diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 118a08ab26d..098e0d3e2cc 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -157,7 +157,7 @@ std::unique_ptr make_fixed_width_column(data_type type, else if (is_duration (type)) return make_duration_column (type, size, state, stream, mr); else if (is_fixed_point(type)) return make_fixed_point_column(type, size, state, stream, mr); else return make_numeric_column (type, size, state, stream, mr); - /// clang-format on + // clang-format on } std::unique_ptr make_dictionary_from_scalar(scalar const& s, From 578e65f09c1eb7c3fe1c600590b26acccca59a5d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 12 Sep 2022 14:18:38 -0700 Subject: 
[PATCH 12/25] Enable ZSTD compression in ORC and Parquet writers (#11551) Closes https://github.com/rapidsai/cudf/issues/9058, https://github.com/rapidsai/cudf/issues/9056 Expands nvCOMP adapter to include ZSTD compression. Adds centralized nvCOMP policy, `is_compression_enabled`. Adds centralized nvCOMP alignment utility, `compress_input_alignment_bits`. Adds centralized nvCOMP utility to get the maximum supported compression chunk size - `batched_compress_max_allowed_chunk_size`. Encoded ORC row groups are aligned based on compression requirements. Encoded Parquet pages are aligned based on compression requirements. Parquet fragment size now scales with the page size to better fit the default page size with ZSTD compression. Small refactoring around `decompress_status` for improved type safety and hopefully naming. Replaced `snappy_compress` from the Parquet writer with the nvCOMP adapter call. Vectors of `compression_result`s are initialized before compression to avoid issues with random chunk skipping due to uninitialized memory. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Jason Lowe (https://github.com/jlowe) - Jim Brennan (https://github.com/jbrennan333) - Mike Wilson (https://github.com/hyperbolic2346) - Tobias Ribizel (https://github.com/upsj) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/11551 --- cpp/src/io/avro/reader_impl.cu | 12 +- cpp/src/io/comp/debrotli.cu | 15 +- cpp/src/io/comp/gpuinflate.cu | 20 +- cpp/src/io/comp/gpuinflate.hpp | 32 ++-- cpp/src/io/comp/nvcomp_adapter.cpp | 147 +++++++++++--- cpp/src/io/comp/nvcomp_adapter.cu | 82 +++++--- cpp/src/io/comp/nvcomp_adapter.cuh | 28 ++- cpp/src/io/comp/nvcomp_adapter.hpp | 49 ++++- cpp/src/io/comp/snap.cu | 13 +- cpp/src/io/comp/uncomp.cpp | 6 +- cpp/src/io/comp/unsnap.cu | 15 +- cpp/src/io/orc/orc.hpp | 12 +- cpp/src/io/orc/orc_common.hpp | 7 - cpp/src/io/orc/orc_gpu.hpp | 22 +-- cpp/src/io/orc/reader_impl.cu | 36 ++-- cpp/src/io/orc/stripe_enc.cu | 85 +++++---- cpp/src/io/orc/stripe_init.cu | 14 +- cpp/src/io/orc/writer_impl.cu | 130 +++++++++---- cpp/src/io/orc/writer_impl.hpp | 2 +- cpp/src/io/parquet/page_enc.cu | 54 +++--- cpp/src/io/parquet/parquet_gpu.hpp | 19 +- cpp/src/io/parquet/reader_impl.cu | 33 ++-- cpp/src/io/parquet/writer_impl.cu | 179 ++++++++---------- cpp/src/io/parquet/writer_impl.hpp | 2 +- cpp/tests/io/comp/decomp_test.cpp | 10 +- .../java/ai/rapids/cudf/CompressionType.java | 20 +- python/cudf/cudf/_lib/orc.pyx | 2 + python/cudf/cudf/_lib/parquet.pyx | 2 + python/cudf/cudf/tests/test_orc.py | 19 +- python/cudf/cudf/tests/test_parquet.py | 20 ++ python/cudf/cudf/utils/ioutils.py | 4 +- 31 files changed, 686 insertions(+), 405 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index e5b73dc9360..7fcdf1bf29a 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -164,7 +164,11 @@ rmm::device_buffer decompress_data(datasource& source, if 
(meta.codec == "deflate") { auto inflate_in = hostdevice_vector>(meta.block_list.size(), stream); auto inflate_out = hostdevice_vector>(meta.block_list.size(), stream); - auto inflate_stats = hostdevice_vector(meta.block_list.size(), stream); + auto inflate_stats = hostdevice_vector(meta.block_list.size(), stream); + thrust::fill(rmm::exec_policy(stream), + inflate_stats.d_begin(), + inflate_stats.d_end(), + compression_result{0, compression_status::FAILURE}); // Guess an initial maximum uncompressed block size. We estimate the compression factor is two // and round up to the next multiple of 4096 bytes. @@ -190,8 +194,6 @@ rmm::device_buffer decompress_data(datasource& source, for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) { inflate_out.host_to_device(stream); - CUDF_CUDA_TRY(cudaMemsetAsync( - inflate_stats.device_ptr(), 0, inflate_stats.memory_size(), stream.value())); gpuinflate(inflate_in, inflate_out, inflate_stats, gzip_header_included::NO, stream); inflate_stats.device_to_host(stream, true); @@ -204,9 +206,9 @@ rmm::device_buffer decompress_data(datasource& source, inflate_stats.begin(), std::back_inserter(actual_uncomp_sizes), [](auto const& inf_out, auto const& inf_stats) { - // If error status is 1 (buffer too small), the `bytes_written` field + // If error status is OUTPUT_OVERFLOW, the `bytes_written` field // actually contains the uncompressed data size - return inf_stats.status == 1 + return inf_stats.status == compression_status::OUTPUT_OVERFLOW ? 
std::max(inf_out.size(), inf_stats.bytes_written) : inf_out.size(); }); diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 07dc2cc9870..b6f2d2db811 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1906,7 +1906,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction * * @param[in] inputs Source buffer per block * @param[out] outputs Destination buffer per block - * @param[out] statuses Decompressor status per block + * @param[out] results Decompressor status per block * @param scratch Intermediate device memory heap space (will be dynamically shared between blocks) * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between * blocks) @@ -1914,7 +1914,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction __global__ void __launch_bounds__(block_size, 2) gpu_debrotli_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, uint8_t* scratch, uint32_t scratch_size) { @@ -2016,10 +2016,11 @@ __global__ void __launch_bounds__(block_size, 2) __syncthreads(); // Output decompression status if (!t) { - statuses[block_id].bytes_written = s->out - s->outbase; - statuses[block_id].status = s->error; + results[block_id].bytes_written = s->out - s->outbase; + results[block_id].status = + (s->error == 0) ? 
compression_status::SUCCESS : compression_status::FAILURE; // Return ext heap used by last block (statistics) - statuses[block_id].reserved = s->fb_size; + results[block_id].reserved = s->fb_size; } } @@ -2079,7 +2080,7 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs) void gpu_debrotli(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, void* scratch, size_t scratch_size, rmm::cuda_stream_view stream) @@ -2104,7 +2105,7 @@ void gpu_debrotli(device_span const> inputs, cudaMemcpyHostToDevice, stream.value())); gpu_debrotli_kernel<<>>( - inputs, outputs, statuses, scratch_u8, fb_heap_size); + inputs, outputs, results, scratch_u8, fb_heap_size); #if DUMP_FB_HEAP uint32_t dump[2]; uint32_t cur = 0; diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 16f4ea84f7f..dacc5a00d16 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1020,14 +1020,14 @@ __device__ int parse_gzip_header(const uint8_t* src, size_t src_size) * @tparam block_size Thread block dimension for this call * @param inputs Source and destination buffer information per block * @param outputs Destination buffer information per block - * @param statuses Decompression status buffer per block + * @param results Decompression status buffer per block * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header */ template __global__ void __launch_bounds__(block_size) inflate_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, gzip_header_included parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; @@ -1133,9 +1133,15 @@ __global__ void __launch_bounds__(block_size) // Output buffer too small state->err = 1; } - statuses[z].bytes_written = state->out - state->outbase; - statuses[z].status = state->err; - statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug 
purposes + results[z].bytes_written = state->out - state->outbase; + results[z].status = [&]() { + switch (state->err) { + case 0: return compression_status::SUCCESS; + case 1: return compression_status::OUTPUT_OVERFLOW; + default: return compression_status::FAILURE; + } + }(); + results[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } @@ -1200,14 +1206,14 @@ __global__ void __launch_bounds__(1024) void gpuinflate(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, gzip_header_included parse_hdr, rmm::cuda_stream_view stream) { constexpr int block_size = 128; // Threads per block if (inputs.size() > 0) { inflate_kernel - <<>>(inputs, outputs, statuses, parse_hdr); + <<>>(inputs, outputs, results, parse_hdr); } } diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 3870b2ac3b3..1b45a31b13b 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -26,11 +26,21 @@ namespace cudf { namespace io { /** - * @brief Output parameters for the decompression interface + * @brief Status of a compression/decompression operation. */ -struct decompress_status { +enum class compression_status : uint8_t { + SUCCESS, ///< Successful, output is valid + FAILURE, ///< Failed, output is invalid (e.g. input is unsupported in some way) + SKIPPED, ///< Operation skipped (if conversion, uncompressed data can be used) + OUTPUT_OVERFLOW, ///< Output buffer is too small; operation can succeed with larger output +}; + +/** + * @brief Descriptor of compression/decompression result. 
+ */ +struct compression_result { uint64_t bytes_written; - uint32_t status; + compression_status status; uint32_t reserved; }; @@ -44,13 +54,13 @@ enum class gzip_header_included { NO, YES }; * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] parse_hdr Whether or not to parse GZIP header * @param[in] stream CUDA stream to use */ void gpuinflate(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, gzip_header_included parse_hdr, rmm::cuda_stream_view stream); @@ -73,12 +83,12 @@ void gpu_copy_uncompressed_blocks(device_span const> * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] stream CUDA stream to use */ void gpu_unsnap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream); /** @@ -98,14 +108,14 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] scratch Temporary memory for intermediate work * @param[in] scratch_size Size in bytes of the temporary memory * @param[in] stream CUDA stream to use */ void gpu_debrotli(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, void* scratch, size_t scratch_size, rmm::cuda_stream_view stream); @@ -118,12 +128,12 @@ void gpu_debrotli(device_span const> inputs, * * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status 
structures + * @param[out] results List of output status structures * @param[in] stream CUDA stream to use */ void gpu_snap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream); } // namespace io diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 91deda50cf2..31f7b9b472e 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -21,17 +21,29 @@ #include +#define NVCOMP_DEFLATE_HEADER +#if __has_include(NVCOMP_DEFLATE_HEADER) +#include NVCOMP_DEFLATE_HEADER +#endif + #define NVCOMP_ZSTD_HEADER #if __has_include(NVCOMP_ZSTD_HEADER) #include NVCOMP_ZSTD_HEADER -#define NVCOMP_HAS_ZSTD 1 +#endif + +#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) +#define NVCOMP_HAS_ZSTD_DECOMP 1 #else -#define NVCOMP_HAS_ZSTD 0 +#define NVCOMP_HAS_ZSTD_DECOMP 0 #endif -#define NVCOMP_DEFLATE_HEADER -#if __has_include(NVCOMP_DEFLATE_HEADER) -#include NVCOMP_DEFLATE_HEADER +#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 4) +#define NVCOMP_HAS_ZSTD_COMP 1 +#else +#define NVCOMP_HAS_ZSTD_COMP 0 +#endif + +#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) #define NVCOMP_HAS_DEFLATE 1 #else #define NVCOMP_HAS_DEFLATE 0 @@ -63,7 +75,7 @@ nvcompStatus_t batched_decompress_get_temp_size_ex(compression_type compression, case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSizeEx(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD +#if NVCOMP_HAS_ZSTD_DECOMP return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward(args)...); #else CUDF_FAIL("Unsupported compression type"); @@ -83,7 +95,7 @@ auto batched_decompress_get_temp_size(compression_type compression, Args&&... 
ar case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSize(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD +#if NVCOMP_HAS_ZSTD_DECOMP return nvcompBatchedZstdDecompressGetTempSize(std::forward(args)...); #else CUDF_FAIL("Unsupported compression type"); @@ -106,7 +118,7 @@ auto batched_decompress_async(compression_type compression, Args&&... args) case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressAsync(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD +#if NVCOMP_HAS_ZSTD_DECOMP return nvcompBatchedZstdDecompressAsync(std::forward(args)...); #else CUDF_FAIL("Unsupported compression type"); @@ -146,14 +158,14 @@ size_t batched_decompress_temp_size(compression_type compression, void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size, rmm::cuda_stream_view stream) { // TODO Consolidate config use to a common location if (compression == compression_type::ZSTD) { -#if NVCOMP_HAS_ZSTD +#if NVCOMP_HAS_ZSTD_DECOMP #if NVCOMP_ZSTD_IS_EXPERIMENTAL CUDF_EXPECTS(cudf::io::detail::nvcomp_integration::is_all_enabled(), "Zstandard compression is experimental, you can enable it through " @@ -187,7 +199,7 @@ void batched_decompress(compression_type compression, stream.value()); CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); - convert_status(nvcomp_statuses, actual_uncompressed_data_sizes, statuses, stream); + update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); } // Dispatcher for nvcompBatchedCompressGetTempSize @@ -210,7 +222,14 @@ auto batched_compress_temp_size(compression_type compression, #else CUDF_FAIL("Unsupported compression type"); #endif - case compression_type::ZSTD: [[fallthrough]]; + case compression_type::ZSTD: +#if 
NVCOMP_HAS_ZSTD_COMP + nvcomp_status = nvcompBatchedZstdCompressGetTempSize( + batch_size, max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &temp_size); + break; +#else + CUDF_FAIL("Unsupported compression type"); +#endif default: CUDF_FAIL("Unsupported compression type"); } @@ -219,26 +238,36 @@ auto batched_compress_temp_size(compression_type compression, return temp_size; } -// Dispatcher for nvcompBatchedCompressGetMaxOutputChunkSize -size_t batched_compress_get_max_output_chunk_size(compression_type compression, - uint32_t max_uncompressed_chunk_bytes) +size_t compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncompressed_chunk_bytes) { + auto const capped_uncomp_bytes = std::min( + compress_max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), + max_uncompressed_chunk_bytes); + size_t max_comp_chunk_size = 0; nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; switch (compression) { case compression_type::SNAPPY: status = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); + capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); break; case compression_type::DEFLATE: #if NVCOMP_HAS_DEFLATE status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); + capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); + break; +#else + CUDF_FAIL("Unsupported compression type"); +#endif + case compression_type::ZSTD: +#if NVCOMP_HAS_ZSTD_COMP + status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); break; #else CUDF_FAIL("Unsupported compression type"); #endif - case compression_type::ZSTD: [[fallthrough]]; default: CUDF_FAIL("Unsupported compression type"); } @@ -289,26 +318,50 @@ static void 
batched_compress_async(compression_type compression, #else CUDF_FAIL("Unsupported compression type"); #endif - case compression_type::ZSTD: [[fallthrough]]; + case compression_type::ZSTD: +#if NVCOMP_HAS_ZSTD_COMP + nvcomp_status = nvcompBatchedZstdCompressAsync(device_uncompressed_ptrs, + device_uncompressed_bytes, + max_uncompressed_chunk_bytes, + batch_size, + device_temp_ptr, + temp_bytes, + device_compressed_ptrs, + device_compressed_bytes, + nvcompBatchedZstdDefaultOpts, + stream.value()); + break; +#else + CUDF_FAIL("Unsupported compression type"); +#endif default: CUDF_FAIL("Unsupported compression type"); } CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in compression"); } +bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept +{ + return (reinterpret_cast(ptr) % alignment) == 0; +} + void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, - uint32_t max_uncomp_chunk_size, + device_span results, rmm::cuda_stream_view stream) { auto const num_chunks = inputs.size(); + auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); + + auto const max_uncomp_chunk_size = skip_unsupported_inputs( + nvcomp_args.input_data_sizes, results, compress_max_allowed_chunk_size(compression), stream); + auto const temp_size = batched_compress_temp_size(compression, num_chunks, max_uncomp_chunk_size); rmm::device_buffer scratch(temp_size, stream); + CUDF_EXPECTS(is_aligned(scratch.data(), 8), "Compression failed, misaligned scratch buffer"); rmm::device_uvector actual_compressed_data_sizes(num_chunks, stream); - auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); batched_compress_async(compression, nvcomp_args.input_data_ptrs.data(), @@ -321,7 +374,55 @@ void batched_compress(compression_type compression, actual_compressed_data_sizes.data(), stream.value()); - convert_status(std::nullopt, actual_compressed_data_sizes, statuses, 
stream); + update_compression_results(actual_compressed_data_sizes, results, stream); +} + +bool is_compression_enabled(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: + return NVCOMP_HAS_DEFLATE and detail::nvcomp_integration::is_all_enabled(); + case compression_type::SNAPPY: return detail::nvcomp_integration::is_stable_enabled(); + case compression_type::ZSTD: + return NVCOMP_HAS_ZSTD_COMP and detail::nvcomp_integration::is_all_enabled(); + default: return false; + } + return false; +} + +size_t compress_input_alignment_bits(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 0; + case compression_type::SNAPPY: return 0; + case compression_type::ZSTD: return 2; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +size_t compress_output_alignment_bits(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 3; + case compression_type::SNAPPY: return 0; + case compression_type::ZSTD: return 0; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +std::optional compress_max_allowed_chunk_size(compression_type compression) +{ + switch (compression) { + case compression_type::DEFLATE: return 64 * 1024; + case compression_type::SNAPPY: return std::nullopt; + case compression_type::ZSTD: +#if NVCOMP_HAS_ZSTD_COMP + return nvcompZstdCompressionMaxAllowedChunkSize; +#else + CUDF_FAIL("Unsupported compression type"); +#endif + default: return std::nullopt; + } } } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 30551dc31cf..c3c1bff9073 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -57,31 +57,69 @@ batched_args create_batched_nvcomp_args(device_span c std::move(output_data_sizes)}; } -void convert_status(std::optional> nvcomp_stats, - device_span actual_uncompressed_sizes, - device_span cudf_stats, - 
rmm::cuda_stream_view stream) +void update_compression_results(device_span nvcomp_stats, + device_span actual_output_sizes, + device_span results, + rmm::cuda_stream_view stream) { - if (nvcomp_stats.has_value()) { - thrust::transform( + thrust::transform_if( + rmm::exec_policy(stream), + nvcomp_stats.begin(), + nvcomp_stats.end(), + actual_output_sizes.begin(), + results.begin(), + results.begin(), + [] __device__(auto const& nvcomp_status, auto const& size) { + return compression_result{size, + nvcomp_status == nvcompStatus_t::nvcompSuccess + ? compression_status::SUCCESS + : compression_status::FAILURE}; + }, + [] __device__(auto const& cudf_status) { + return cudf_status.status != compression_status::SKIPPED; + }); +} + +void update_compression_results(device_span actual_output_sizes, + device_span results, + rmm::cuda_stream_view stream) +{ + thrust::transform_if( + rmm::exec_policy(stream), + actual_output_sizes.begin(), + actual_output_sizes.end(), + results.begin(), + results.begin(), + [] __device__(auto const& size) { return compression_result{size}; }, + [] __device__(auto const& results) { return results.status != compression_status::SKIPPED; }); +} + +size_t skip_unsupported_inputs(device_span input_sizes, + device_span results, + std::optional max_valid_input_size, + rmm::cuda_stream_view stream) +{ + if (max_valid_input_size.has_value()) { + auto status_size_it = thrust::make_zip_iterator(input_sizes.begin(), results.begin()); + thrust::transform_if( rmm::exec_policy(stream), - nvcomp_stats->begin(), - nvcomp_stats->end(), - actual_uncompressed_sizes.begin(), - cudf_stats.begin(), - [] __device__(auto const& status, auto const& size) { - return decompress_status{size, status == nvcompStatus_t::nvcompSuccess ? 
0u : 1u}; + results.begin(), + results.end(), + input_sizes.begin(), + status_size_it, + [] __device__(auto const& status) { + return thrust::pair{0, compression_result{0, compression_status::SKIPPED}}; + }, + [max_size = max_valid_input_size.value()] __device__(size_t input_size) { + return input_size > max_size; }); - } else { - thrust::transform(rmm::exec_policy(stream), - actual_uncompressed_sizes.begin(), - actual_uncompressed_sizes.end(), - cudf_stats.begin(), - [] __device__(size_t size) { - decompress_status status{}; - status.bytes_written = size; - return status; - }); } + + return thrust::reduce(rmm::exec_policy(stream), + input_sizes.begin(), + input_sizes.end(), + 0ul, + thrust::maximum()); } + } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh index 1cc65d41a51..e49a9a6d348 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cuh +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -48,10 +48,28 @@ batched_args create_batched_nvcomp_args(device_span c rmm::cuda_stream_view stream); /** - * @brief Convert nvcomp statuses into cuIO compression statuses. + * @brief Convert nvcomp statuses and output sizes into cuIO compression results. */ -void convert_status(std::optional> nvcomp_stats, - device_span actual_uncompressed_sizes, - device_span cudf_stats, - rmm::cuda_stream_view stream); +void update_compression_results(device_span nvcomp_stats, + device_span actual_output_sizes, + device_span results, + rmm::cuda_stream_view stream); + +/** + * @brief Fill the result array based on the actual output sizes. + */ +void update_compression_results(device_span actual_output_sizes, + device_span results, + rmm::cuda_stream_view stream); + +/** + * @brief Mark unsupported input chunks for skipping. + * + * Returns the size of the largest remaining input chunk. 
+ */ +size_t skip_unsupported_inputs(device_span input_sizes, + device_span results, + std::optional max_valid_input_size, + rmm::cuda_stream_view stream); + } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 40a85a3ac37..41af564ca76 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,6 +18,7 @@ #include "gpuinflate.hpp" +#include #include #include @@ -26,13 +27,23 @@ namespace cudf::io::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE }; +/** + * @brief Whether the given compression type is enabled through nvCOMP. + * + * Result depends on nvCOMP version and environment variables. + * + * @param compression Compression type + * @returns true if nvCOMP use is enabled; false otherwise + */ +[[nodiscard]] bool is_compression_enabled(compression_type compression); + /** * @brief Device batch decompression of given type. * * @param[in] compression Compression type * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures + * @param[out] results List of output status structures * @param[in] max_uncomp_chunk_size maximum size of uncompressed chunk * @param[in] max_total_uncomp_size maximum total size of uncompressed data * @param[in] stream CUDA stream to use @@ -40,7 +51,7 @@ enum class compression_type { SNAPPY, ZSTD, DEFLATE }; void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size, rmm::cuda_stream_view stream); @@ -51,8 +62,32 @@ void batched_decompress(compression_type compression, * @param compression Compression type * @param max_uncomp_chunk_size Size of the largest uncompressed chunk in the batch */ -size_t batched_compress_get_max_output_chunk_size(compression_type compression, - uint32_t 
max_uncomp_chunk_size); +[[nodiscard]] size_t compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncomp_chunk_size); + +/** + * @brief Gets input alignment requirements for the given compression type. + * + * @param compression Compression type + * @returns required alignment, in bits + */ +[[nodiscard]] size_t compress_input_alignment_bits(compression_type compression); + +/** + * @brief Gets output alignment requirements for the given compression type. + * + * @param compression Compression type + * @returns required alignment, in bits + */ +[[nodiscard]] size_t compress_output_alignment_bits(compression_type compression); + +/** + * @brief Maximum size of uncompressed chunks that can be compressed with nvCOMP. + * + * @param compression Compression type + * @returns maximum chunk size + */ +[[nodiscard]] std::optional compress_max_allowed_chunk_size(compression_type compression); /** * @brief Device batch compression of given type. @@ -60,15 +95,13 @@ size_t batched_compress_get_max_output_chunk_size(compression_type compression, * @param[in] compression Compression type * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers - * @param[out] statuses List of output status structures - * @param[in] max_uncomp_chunk_size Size of the largest uncompressed chunk in the batch + * @param[out] results List of output status structures * @param[in] stream CUDA stream to use */ void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, - device_span statuses, - uint32_t max_uncomp_chunk_size, + device_span results, rmm::cuda_stream_view stream); } // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 820a7f937d7..6c7ab490751 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -260,7 +260,7 @@ static __device__ uint32_t Match60(const uint8_t* src1, __global__ void __launch_bounds__(128) snap_kernel(device_span 
const> inputs, device_span const> outputs, - device_span statuses) + device_span results) { __shared__ __align__(16) snap_state_s state_g; @@ -337,21 +337,22 @@ __global__ void __launch_bounds__(128) } __syncthreads(); if (!t) { - statuses[blockIdx.x].bytes_written = s->dst - s->dst_base; - statuses[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; - statuses[blockIdx.x].reserved = 0; + results[blockIdx.x].bytes_written = s->dst - s->dst_base; + results[blockIdx.x].status = + (s->dst > s->end) ? compression_status::FAILURE : compression_status::SUCCESS; + results[blockIdx.x].reserved = 0; } } void gpu_snap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(inputs.size(), 1); if (inputs.size() > 0) { - snap_kernel<<>>(inputs, outputs, statuses); + snap_kernel<<>>(inputs, outputs, results); } } diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 6f33c9f1de9..8e58f86317c 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -520,7 +520,9 @@ size_t decompress_zstd(host_span src, hd_dsts[0] = d_dst; hd_dsts.host_to_device(stream); - auto hd_stats = hostdevice_vector(1, stream); + auto hd_stats = hostdevice_vector(1, stream); + hd_stats[0] = compression_result{0, compression_status::FAILURE}; + hd_stats.host_to_device(stream); auto const max_uncomp_page_size = dst.size(); nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, hd_srcs, @@ -531,7 +533,7 @@ size_t decompress_zstd(host_span src, stream); hd_stats.device_to_host(stream, true); - CUDF_EXPECTS(hd_stats[0].status == 0, "ZSTD decompression failed"); + CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed"); // Copy temporary output to `dst` CUDF_CUDA_TRY(cudaMemcpyAsync( diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 98011a57ea8..8b13ddd1de4 
100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -627,7 +627,7 @@ template __global__ void __launch_bounds__(block_size) unsnap_kernel(device_span const> inputs, device_span const> outputs, - device_span statuses) + device_span results) { __shared__ __align__(16) unsnap_state_s state_g; __shared__ cub::WarpReduce::TempStorage temp_storage; @@ -698,25 +698,26 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); } if (!t) { - statuses[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; - statuses[strm_id].status = s->error; + results[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; + results[strm_id].status = + (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE; if (log_cyclecount) { - statuses[strm_id].reserved = clock() - s->tstart; + results[strm_id].reserved = clock() - s->tstart; } else { - statuses[strm_id].reserved = 0; + results[strm_id].reserved = 0; } } } void gpu_unsnap(device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(inputs.size(), 1); // TODO: Check max grid dimensions vs max expected count - unsnap_kernel<128><<>>(inputs, outputs, statuses); + unsnap_kernel<128><<>>(inputs, outputs, results); } } // namespace io diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 858f7682b11..a007750d264 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -38,12 +38,12 @@ namespace cudf { namespace io { namespace orc { struct PostScript { - uint64_t footerLength = 0; // the length of the footer section in bytes - CompressionKind compression = NONE; // the kind of generic compression used - uint32_t compressionBlockSize = 256 * 1024; // the maximum size of each compression chunk - std::vector version; // the version of the writer [major, minor] - uint64_t metadataLength = 0; // the length of 
the metadata section in bytes - std::string magic = ""; // the fixed string "ORC" + uint64_t footerLength = 0; // the length of the footer section in bytes + CompressionKind compression = NONE; // the kind of generic compression used + uint32_t compressionBlockSize{}; // the maximum size of each compression chunk + std::vector version; // the version of the writer [major, minor] + uint64_t metadataLength = 0; // the length of the metadata section in bytes + std::string magic = ""; // the fixed string "ORC" }; struct StripeInformation { diff --git a/cpp/src/io/orc/orc_common.hpp b/cpp/src/io/orc/orc_common.hpp index 29a4ad6ed78..c2898b362a6 100644 --- a/cpp/src/io/orc/orc_common.hpp +++ b/cpp/src/io/orc/orc_common.hpp @@ -24,13 +24,6 @@ namespace orc { static constexpr uint32_t block_header_size = 3; -constexpr uint32_t compressed_block_size(uint32_t compressed_data_size) -{ - return ((compressed_data_size + block_header_size + 0xFF) & ~0xFF); -} - -static constexpr uint32_t padded_block_header_size = compressed_block_size(0); - enum CompressionKind : uint8_t { NONE = 0, ZLIB = 1, diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 9de7dfffc0c..c7a7a423cf2 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -56,12 +56,12 @@ struct CompressedStreamInfo { } const uint8_t* compressed_data; // [in] base ptr to compressed stream data uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - device_span* dec_in_ctl; // [in] input buffer to decompress - device_span* dec_out_ctl; // [in] output buffer to decompress into - device_span decstatus; // [in] results of decompression - device_span* copy_in_ctl; // [out] input buffer to copy - device_span* copy_out_ctl; // [out] output buffer to copy to + size_t compressed_data_size; // [in] compressed data size for this stream + device_span* dec_in_ctl; // [in] input 
buffer to decompress + device_span* dec_out_ctl; // [in] output buffer to decompress into + device_span dec_res; // [in] results of decompression + device_span* copy_in_ctl; // [out] input buffer to copy + device_span* copy_out_ctl; // [out] output buffer to copy to uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed // blocks(out) uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of @@ -348,11 +348,10 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] compression Type of compression * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression + * @param[in] comp_block_align Required alignment for compressed blocks * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] - * @param[out] comp_in Per-block compression input buffers - * @param[out] comp_out Per-block compression output buffers - * @param[out] comp_stat Per-block compression status + * @param[out] comp_res Per-block compression status * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void CompressOrcDataStreams(uint8_t* compressed_data, @@ -360,11 +359,10 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CompressionKind compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, + uint32_t comp_block_align, device_2dspan strm_desc, device_2dspan enc_streams, - device_span> comp_in, - device_span> comp_out, - device_span comp_stat, + device_span comp_res, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index c79aa5d7a4f..7ff3ee85939 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -262,26 +262,26 @@ auto decimal_column_type(std::vector const& decimal128_columns, } // namespace -__global__ void 
decompress_check_kernel(device_span stats, +__global__ void decompress_check_kernel(device_span results, bool* any_block_failure) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - if (stats[tid].status != 0) { + if (tid < results.size()) { + if (results[tid].status != compression_status::SUCCESS) { *any_block_failure = true; // Doesn't need to be atomic } } } -void decompress_check(device_span stats, +void decompress_check(device_span results, bool* any_block_failure, rmm::cuda_stream_view stream) { - if (stats.empty()) { return; } // early exit for empty stats + if (results.empty()) { return; } // early exit for empty results dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(stats.size(), static_cast(block.x))); - decompress_check_kernel<<>>(stats, any_block_failure); + dim3 grid(cudf::util::div_rounding_up_safe(results.size(), static_cast(block.x))); + decompress_check_kernel<<>>(results, any_block_failure); } rmm::device_buffer reader::impl::decompress_stripe_data( @@ -337,7 +337,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( num_compressed_blocks + num_uncompressed_blocks, stream); rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_stats(num_compressed_blocks, stream); + rmm::device_uvector inflate_res(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + inflate_res.begin(), + inflate_res.end(), + compression_result{0, compression_status::FAILURE}); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; @@ -349,8 +353,8 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo[i].uncompressed_data = dst_base + decomp_offset; compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; - compinfo[i].decstatus = {inflate_stats.data() + start_pos, compinfo[i].num_compressed_blocks}; - compinfo[i].copy_in_ctl = 
inflate_in.data() + start_pos_uncomp; + compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; + compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; stream_info[i].dst_pos = decomp_offset; @@ -379,13 +383,13 @@ rmm::device_buffer reader::impl::decompress_stripe_data( nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, inflate_in_view, inflate_out_view, - inflate_stats, + inflate_res, max_uncomp_block_size, total_decomp_size, stream); } else { gpuinflate( - inflate_in_view, inflate_out_view, inflate_stats, gzip_header_included::NO, stream); + inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); } break; case compression_type::SNAPPY: @@ -393,26 +397,26 @@ rmm::device_buffer reader::impl::decompress_stripe_data( nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, inflate_in_view, inflate_out_view, - inflate_stats, + inflate_res, max_uncomp_block_size, total_decomp_size, stream); } else { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_stats, stream); + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); } break; case compression_type::ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, inflate_in_view, inflate_out_view, - inflate_stats, + inflate_res, max_uncomp_block_size, total_decomp_size, stream); break; default: CUDF_FAIL("Unexpected decompression dispatch"); break; } - decompress_check(inflate_stats, any_block_failure.device_ptr(), stream); + decompress_check(inflate_res, any_block_failure.device_ptr(), stream); } if (num_uncompressed_blocks > 0) { device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5e9a6f8df6b..b1c04099e64 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -17,14 +17,16 @@ #include "orc_common.hpp" #include "orc_gpu.hpp" 
-#include -#include -#include #include #include #include #include +#include +#include +#include +#include + #include #include #include @@ -1142,10 +1144,11 @@ __global__ void __launch_bounds__(1024) * @param[in] chunks EncChunk device array [rowgroup][column] * @param[out] inputs Per-block compression input buffers * @param[out] outputs Per-block compression output buffers - * @param[out] statuses Per-block compression status + * @param[out] results Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression + * @param[in] comp_block_align Required alignment for compressed blocks */ // blockDim {256,1,1} __global__ void __launch_bounds__(256) @@ -1153,14 +1156,18 @@ __global__ void __launch_bounds__(256) device_2dspan streams, // const? device_span> inputs, device_span> outputs, - device_span statuses, + device_span results, uint8_t* compressed_bfr, uint32_t comp_blk_size, - uint32_t max_comp_blk_size) + uint32_t max_comp_blk_size, + uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; __shared__ uint8_t* volatile uncomp_base_g; + auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); + auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); + auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; uint32_t t = threadIdx.x; @@ -1177,10 +1184,10 @@ __global__ void __launch_bounds__(256) num_blocks = (ss.stream_size > 0) ? 
(ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; - auto const dst_offset = b * compressed_block_size(max_comp_blk_size) + padded_block_header_size; - outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; - statuses[ss.first_block + b] = {blk_size, 1, 0}; + inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; + auto const dst_offset = b * (padded_block_header_size + padded_comp_block_size); + outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; + results[ss.first_block + b] = {0, compression_status::FAILURE}; } } @@ -1190,9 +1197,9 @@ __global__ void __launch_bounds__(256) * * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in] chunks EncChunk device array [rowgroup][column] - * @param[out] inputs Per-block compression input buffers + * @param[in] inputs Per-block compression input buffers * @param[out] outputs Per-block compression output buffers - * @param[out] statuses Per-block compression status + * @param[out] results Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1202,7 +1209,7 @@ __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, device_span const> inputs, device_span const> outputs, - device_span statuses, + device_span results, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1228,16 +1235,16 @@ __global__ void __launch_bounds__(1024) if (t == 0) { auto const src_len = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - auto dst_len = (statuses[ss.first_block + b].status == 0) - ? 
statuses[ss.first_block + b].bytes_written + auto dst_len = (results[ss.first_block + b].status == compression_status::SUCCESS) + ? results[ss.first_block + b].bytes_written : src_len; uint32_t blk_size24{}; - if (statuses[ss.first_block + b].status == 0) { + if (results[ss.first_block + b].status == compression_status::SUCCESS) { // Copy from uncompressed source - src = inputs[ss.first_block + b].data(); - statuses[ss.first_block + b].bytes_written = src_len; - dst_len = src_len; - blk_size24 = dst_len * 2 + 1; + src = inputs[ss.first_block + b].data(); + results[ss.first_block + b].bytes_written = src_len; + dst_len = src_len; + blk_size24 = dst_len * 2 + 1; } else { // Compressed block src = outputs[ss.first_block + b].data(); @@ -1307,51 +1314,59 @@ void CompressOrcDataStreams(uint8_t* compressed_data, CompressionKind compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, + uint32_t comp_block_align, device_2dspan strm_desc, device_2dspan enc_streams, - device_span> comp_in, - device_span> comp_out, - device_span comp_stat, + device_span comp_res, rmm::cuda_stream_view stream) { + rmm::device_uvector> comp_in(num_compressed_blocks, stream); + rmm::device_uvector> comp_out(num_compressed_blocks, stream); + dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); gpuInitCompressionBlocks<<>>(strm_desc, enc_streams, comp_in, comp_out, - comp_stat, + comp_res, compressed_data, comp_blk_size, - max_comp_blk_size); + max_comp_blk_size, + comp_block_align); if (compression == SNAPPY) { try { - if (detail::nvcomp_integration::is_stable_enabled()) { + if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_stat, comp_blk_size, stream); + nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } else { - gpu_snap(comp_in, comp_out, comp_stat, stream); + gpu_snap(comp_in, comp_out, comp_res, stream); } } catch 
(...) { // There was an error in compressing so set an error status for each block - thrust::for_each(rmm::exec_policy(stream), - comp_stat.begin(), - comp_stat.end(), - [] __device__(decompress_status & stat) { stat.status = 1; }); + thrust::for_each( + rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; }); // Since SNAPPY is the default compression (may not be explicitly requested), fall back to // writing without compression } - } else if (compression == ZLIB and detail::nvcomp_integration::is_all_enabled()) { + } else if (compression == ZLIB and + nvcomp::is_compression_enabled(nvcomp::compression_type::DEFLATE)) { nvcomp::batched_compress( - nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_stat, comp_blk_size, stream); + nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); + } else if (compression == ZSTD and + nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { + nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); } else if (compression != NONE) { CUDF_FAIL("Unsupported compression type"); } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, comp_stat, compressed_data, comp_blk_size, max_comp_blk_size); + strm_desc, comp_in, comp_out, comp_res, compressed_data, comp_blk_size, max_comp_blk_size); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index edae60bfa6d..bd65089810e 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(128, 8) const uint8_t* cur = s->info.compressed_data; const uint8_t* end = cur + s->info.compressed_data_size; auto dec_out = s->info.dec_out_ctl; - auto dec_status = s->info.decstatus; + auto dec_result = s->info.dec_res; uint8_t* uncompressed_actual = s->info.uncompressed_data; 
uint8_t* uncompressed_estimated = uncompressed_actual; uint32_t num_compressed_blocks = 0; @@ -178,13 +178,9 @@ __global__ void __launch_bounds__(128, 8) uncompressed_size_actual = block_len; } else { if (num_compressed_blocks > max_compressed_blocks) { break; } - if (shuffle((lane_id == 0) ? dec_status[num_compressed_blocks].status : 0) != 0) { - // Decompression failed, not much point in doing anything else - break; - } uint32_t const dst_size = dec_out[num_compressed_blocks].size(); uncompressed_size_est = shuffle((lane_id == 0) ? dst_size : 0); - uint32_t const bytes_written = dec_status[num_compressed_blocks].bytes_written; + uint32_t const bytes_written = dec_result[num_compressed_blocks].bytes_written; uncompressed_size_actual = shuffle((lane_id == 0) ? bytes_written : 0); } // In practice, this should never happen with a well-behaved writer, as we would expect the @@ -383,7 +379,7 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, const uint8_t* start = s->strm_info[ci_id].compressed_data; const uint8_t* cur = start; const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; - auto decstatus = s->strm_info[ci_id].decstatus.data(); + auto dec_result = s->strm_info[ci_id].dec_res.data(); uint32_t uncomp_offset = 0; for (;;) { uint32_t block_len; @@ -400,8 +396,8 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, if (is_uncompressed) { uncomp_offset += block_len; } else { - uncomp_offset += decstatus->bytes_written; - decstatus++; + uncomp_offset += dec_result->bytes_written; + dec_result++; } } s->rowgroups[t].strm_offset[ci_id] += uncomp_offset; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 90858ac6fcc..a5e9e9da4cb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -85,7 +85,18 @@ template using pinned_buffer = std::unique_ptr; /** - * @brief Function that translates GDF compression to ORC compression + * @brief Translates ORC 
compression to nvCOMP compression + */ +auto to_nvcomp_compression_type(CompressionKind compression_kind) +{ + if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; + if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; + if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; + CUDF_FAIL("Unsupported compression type"); +} + +/** + * @brief Translates cuDF compression to ORC compression */ orc::CompressionKind to_orc_compression(compression_type compression) { @@ -93,27 +104,30 @@ orc::CompressionKind to_orc_compression(compression_type compression) case compression_type::AUTO: case compression_type::SNAPPY: return orc::CompressionKind::SNAPPY; case compression_type::ZLIB: return orc::CompressionKind::ZLIB; + case compression_type::ZSTD: return orc::CompressionKind::ZSTD; case compression_type::NONE: return orc::CompressionKind::NONE; - default: CUDF_FAIL("Unsupported compression type"); return orc::CompressionKind::NONE; + default: CUDF_FAIL("Unsupported compression type"); } } /** * @brief Returns the block size for a given compression kind. - * - * The nvCOMP ZLIB compression is limited to blocks up to 64KiB. */ constexpr size_t compression_block_size(orc::CompressionKind compression) { - switch (compression) { - case orc::CompressionKind::NONE: return 0; - case orc::CompressionKind::ZLIB: return 64 * 1024; - default: return 256 * 1024; - } + if (compression == orc::CompressionKind::NONE) { return 0; } + + auto const ncomp_type = to_nvcomp_compression_type(compression); + auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) + ? 
nvcomp::compress_max_allowed_chunk_size(ncomp_type) + : std::nullopt; + + constexpr size_t max_block_size = 256 * 1024; + return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); } /** - * @brief Function that translates GDF dtype to ORC datatype + * @brief Translates cuDF dtype to ORC datatype */ constexpr orc::TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) { @@ -520,6 +534,26 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) } } +auto uncomp_block_alignment(CompressionKind compression_kind) +{ + if (compression_kind == NONE or + not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(compression_kind)); +} + +auto comp_block_alignment(CompressionKind compression_kind) +{ + if (compression_kind == NONE or + not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + return 1u; + } + + return 1u << nvcomp::compress_output_alignment_bits(to_nvcomp_compression_type(compression_kind)); +} + orc_streams writer::impl::create_streams(host_span columns, file_segmentation const& segmentation, std::map const& decimal_column_sizes) @@ -565,9 +599,13 @@ orc_streams writer::impl::create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - const auto base = column.index() * gpu::CI_NUM_STREAMS; - ids[base + index_type] = streams.size(); - streams.push_back(orc::Stream{kind, column.id(), size}); + auto const max_alignment_padding = uncomp_block_alignment(compression_kind_) - 1; + const auto base = column.index() * gpu::CI_NUM_STREAMS; + ids[base + index_type] = streams.size(); + streams.push_back(orc::Stream{ + kind, + column.id(), + (size == 0) ? 
0 : size + max_alignment_padding * segmentation.num_rowgroups()}); types.push_back(type_kind); }; @@ -868,6 +906,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, encoder_decimal_info&& dec_chunk_sizes, file_segmentation const& segmentation, orc_streams const& streams, + uint32_t uncomp_block_align, rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); @@ -1020,10 +1059,16 @@ encoded_data encode_columns(orc_table_view const& orc_table, strm.lengths[strm_type] = 0; strm.data_ptrs[strm_type] = nullptr; } + auto const misalignment = + reinterpret_cast(strm.data_ptrs[strm_type]) % uncomp_block_align; + if (misalignment != 0) { + strm.data_ptrs[strm_type] += (uncomp_block_align - misalignment); + } } } } } + chunk_streams.host_to_device(stream); if (orc_table.num_rows() > 0) { @@ -1340,7 +1385,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_res, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, @@ -1365,17 +1410,17 @@ void writer::impl::write_index_stream(int32_t stripe_id, } return record; }; - auto scan_record = [=, &comp_out](gpu::encoder_chunk_streams const& stream, + auto scan_record = [=, &comp_res](gpu::encoder_chunk_streams const& stream, gpu::StreamIndexType type, row_group_index_info& record) { if (record.pos >= 0) { record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && (static_cast(record.pos) >= compression_blocksize_) && - (record.comp_pos + block_header_size + comp_out[record.blk_pos].bytes_written < + (record.comp_pos + block_header_size + comp_res[record.blk_pos].bytes_written < static_cast(record.comp_size))) { record.pos -= compression_blocksize_; - record.comp_pos += block_header_size + comp_out[record.blk_pos].bytes_written; + record.comp_pos += block_header_size + comp_res[record.blk_pos].bytes_written; 
record.blk_pos += 1; } } @@ -2007,20 +2052,12 @@ __global__ void copy_string_data(char* string_pool, } } -auto to_nvcomp_compression_type(CompressionKind compression_kind) -{ - if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; - if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; - CUDF_FAIL("Unsupported compression type"); -} - -size_t get_compress_max_output_chunk_size(CompressionKind compression_kind, - uint32_t compression_blocksize) +size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) { if (compression_kind == NONE) return 0; - return batched_compress_get_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), - compression_blocksize); + return compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), + compression_blocksize); } void writer::impl::persisted_statistics::persist(int num_table_rows, @@ -2124,10 +2161,16 @@ void writer::impl::write(table_view const& table) auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); + auto const uncomp_block_align = uncomp_block_alignment(compression_kind_); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); - auto enc_data = encode_columns( - orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams, stream); + auto enc_data = encode_columns(orc_table, + std::move(dictionaries), + std::move(dec_chunk_sizes), + segmentation, + streams, + uncomp_block_align, + stream); // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); @@ -2140,8 +2183,13 @@ void writer::impl::write(table_view const& table) // Allocate intermediate output stream buffer size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; + auto const max_compressed_block_size = - get_compress_max_output_chunk_size(compression_kind_, 
compression_blocksize_); + max_compression_output_size(compression_kind_, compression_blocksize_); + auto const padded_max_compressed_block_size = + util::round_up_unsafe(max_compressed_block_size, uncomp_block_align); + auto const padded_block_header_size = + util::round_up_unsafe(block_header_size, uncomp_block_align); auto stream_output = [&]() { size_t max_stream_size = 0; @@ -2158,7 +2206,8 @@ void writer::impl::write(table_view const& table) (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); stream_size += num_blocks * block_header_size; num_compressed_blocks += num_blocks; - compressed_bfr_size += compressed_block_size(max_compressed_block_size) * num_blocks; + compressed_bfr_size += + (padded_block_header_size + padded_max_compressed_block_size) * num_blocks; } max_stream_size = std::max(max_stream_size, stream_size); } @@ -2177,9 +2226,11 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector> comp_in(num_compressed_blocks, stream); - hostdevice_vector> comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_stats(num_compressed_blocks, stream); + hostdevice_vector comp_results(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + comp_results.d_begin(), + comp_results.d_end(), + compression_result{0, compression_status::FAILURE}); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), @@ -2187,14 +2238,13 @@ void writer::impl::write(table_view const& table) compression_kind_, compression_blocksize_, max_compressed_block_size, + comp_block_alignment(compression_kind_), strm_descs, enc_data.streams, - comp_in, - comp_out, - comp_stats, + comp_results, stream); strm_descs.device_to_host(stream); - comp_stats.device_to_host(stream, true); + comp_results.device_to_host(stream, true); } ProtobufWriter pbw_(&buffer_); @@ 
-2221,7 +2271,7 @@ void writer::impl::write(table_view const& table) segmentation, enc_data.streams, strm_descs, - comp_stats, + comp_results, intermediate_stats.rowgroup_blobs, &stripe, &streams, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index ed360a77632..dc8aad33af0 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -390,7 +390,7 @@ class writer::impl { file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index f06488671c3..77984ee3c27 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -228,7 +228,8 @@ __global__ void __launch_bounds__(128) statistics_merge_group* chunk_grstats, int32_t num_columns, size_t max_page_size_bytes, - size_type max_page_size_rows) + size_type max_page_size_rows, + uint32_t page_align) { // TODO: All writing seems to be done by thread 0. 
Could be replaced by thrust foreach __shared__ __align__(8) parquet_column_device_view col_g; @@ -284,7 +285,8 @@ __global__ void __launch_bounds__(128) page_g.num_rows = ck_g.num_dict_entries; page_g.num_leaf_values = ck_g.num_dict_entries; page_g.num_values = ck_g.num_dict_entries; // TODO: shouldn't matter for dict page - page_offset += page_g.max_hdr_size + page_g.max_data_size; + page_offset += + util::round_up_unsafe(page_g.max_hdr_size + page_g.max_data_size, page_align); if (not comp_page_sizes.empty()) { comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page]; } @@ -360,7 +362,8 @@ __global__ void __launch_bounds__(128) } page_g.max_hdr_size += stats_hdr_len; } - page_g.page_data = ck_g.uncompressed_bfr + page_offset; + page_g.max_hdr_size = util::round_up_unsafe(page_g.max_hdr_size, page_align); + page_g.page_data = ck_g.uncompressed_bfr + page_offset; if (not comp_page_sizes.empty()) { page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset; } @@ -384,7 +387,8 @@ __global__ void __launch_bounds__(128) pagestats_g.start_chunk = ck_g.first_fragment + page_start; pagestats_g.num_chunks = page_g.num_fragments; - page_offset += page_g.max_hdr_size + page_g.max_data_size; + page_offset += + util::round_up_unsafe(page_g.max_hdr_size + page_g.max_data_size, page_align); if (not comp_page_sizes.empty()) { comp_page_offset += page_g.max_hdr_size + comp_page_sizes[ck_g.first_page + num_pages]; } @@ -422,7 +426,7 @@ __global__ void __launch_bounds__(128) __syncwarp(); if (!t) { if (ck_g.ck_stat_size == 0 && ck_g.stats) { - uint32_t ck_stat_size = 48 + 2 * ck_max_stats_len; + uint32_t ck_stat_size = util::round_up_unsafe(48 + 2 * ck_max_stats_len, page_align); page_offset += ck_stat_size; comp_page_offset += ck_stat_size; ck_g.ck_stat_size = ck_stat_size; @@ -866,7 +870,7 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats) + device_span 
comp_results) { __shared__ __align__(8) page_enc_state_s state_g; using block_scan = cub::BlockScan; @@ -1213,18 +1217,17 @@ __global__ void __launch_bounds__(128, 8) } } if (t == 0) { - uint8_t* base = s->page.page_data + s->page.max_hdr_size; - auto actual_data_size = static_cast(s->cur - base); - uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); - s->page.max_data_size = actual_data_size; + uint8_t* base = s->page.page_data + s->page.max_hdr_size; + auto actual_data_size = static_cast(s->cur - base); + s->page.max_data_size = actual_data_size; if (not comp_in.empty()) { comp_in[blockIdx.x] = {base, actual_data_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, compressed_bfr_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, 0}; // size is unused } pages[blockIdx.x] = s->page; - if (not comp_stats.empty()) { - comp_stats[blockIdx.x] = {0, ~0u}; - pages[blockIdx.x].comp_stat = &comp_stats[blockIdx.x]; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; } } } @@ -1257,10 +1260,10 @@ __global__ void __launch_bounds__(128) gpuDecideCompression(device_spanbytes_written; - if (comp_status->status != 0) { atomicAdd(&error_count, 1); } + compressed_data_size += comp_res->bytes_written; + if (comp_res->status != compression_status::SUCCESS) { atomicAdd(&error_count, 1); } } } uncompressed_data_size = warp_reduce(temp_storage[0]).Sum(uncompressed_data_size); @@ -1677,7 +1680,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, // blockDim(128, 1, 1) __global__ void __launch_bounds__(128) gpuEncodePageHeaders(device_span pages, - device_span comp_stat, + device_span comp_results, device_span page_stats, const statistics_chunk* chunk_stats) { @@ -1706,7 +1709,7 @@ __global__ void __launch_bounds__(128) uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { hdr_start = 
page_g.compressed_data; - compressed_page_size = (uint32_t)comp_stat[blockIdx.x].bytes_written; + compressed_page_size = (uint32_t)comp_results[blockIdx.x].bytes_written; page_g.max_data_size = compressed_page_size; } else { hdr_start = page_g.page_data; @@ -2041,6 +2044,7 @@ void InitEncoderPages(device_2dspan chunks, int32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, + uint32_t page_align, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream) @@ -2056,19 +2060,21 @@ void InitEncoderPages(device_2dspan chunks, chunk_grstats, num_columns, max_page_size_bytes, - max_page_size_rows); + max_page_size_rows, + page_align); } void EncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats, + device_span comp_results, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>(pages, comp_in, comp_out, comp_stats); + gpuEncodePages<128> + <<>>(pages, comp_in, comp_out, comp_results); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2077,7 +2083,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view } void EncodePageHeaders(device_span pages, - device_span comp_stats, + device_span comp_results, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream) @@ -2085,7 +2091,7 @@ void EncodePageHeaders(device_span pages, // TODO: single thread task. No need for 128 threads/block. 
Earlier it used to employ rest of the // threads to coop load structs gpuEncodePageHeaders<<>>( - pages, comp_stats, page_stats, chunk_stats); + pages, comp_results, page_stats, chunk_stats); } void GatherPages(device_span chunks, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 610275ee26b..d0d367df962 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -322,15 +322,6 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -/** - * @brief Return worst-case compressed size of compressed data given the uncompressed size - */ -inline size_t __device__ __host__ GetMaxCompressedBfrSize(size_t uncomp_size, - uint32_t num_pages = 1) -{ - return uncomp_size + (uncomp_size >> 7) + num_pages * 8; -} - struct EncPage; /** @@ -389,7 +380,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - decompress_status* comp_stat; //!< Ptr to compression status + compression_result* comp_res; //!< Ptr to compression result }; /** @@ -544,6 +535,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] num_rowgroups Number of fragments per column * @param[in] num_columns Number of columns * @param[in] page_grstats Setup for page-level stats + * @param[in] page_align Required alignment for uncompressed pages * @param[in] chunk_grstats Setup for chunk-level stats * @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages * @param[in] stream CUDA stream to use, default 0 @@ -556,6 +548,7 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, int32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, + uint32_t page_align, statistics_merge_group* page_grstats, statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream); @@ -566,13 +559,13 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, * @param[in,out] pages Device array of EncPages (unordered) * @param[out] comp_in Compressor input buffers * @param[out] comp_in Compressor output buffers - * @param[out] comp_stats Compressor statuses + * @param[out] comp_stats Compressor results * @param[in] stream CUDA stream to use, default 0 */ void EncodePages(device_span pages, device_span> comp_in, device_span> comp_out, - device_span comp_stats, + device_span comp_res, rmm::cuda_stream_view stream); /** @@ -593,7 +586,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view * @param[in] stream CUDA stream to use, default 0 */ void EncodePageHeaders(device_span pages, - device_span comp_stats, + device_span comp_res, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 2553b375e72..59bef6f5600 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ 
b/cpp/src/io/parquet/reader_impl.cu @@ -247,13 +247,15 @@ std::tuple conversion_info(type_id column_type_id, return std::make_tuple(type_width, clock_rate, converted_type); } -inline void decompress_check(device_span stats, +inline void decompress_check(device_span results, rmm::cuda_stream_view stream) { CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - stats.begin(), - stats.end(), - [] __device__(auto const& stat) { return stat.status == 0; }), + results.begin(), + results.end(), + [] __device__(auto const& res) { + return res.status == compression_status::SUCCESS; + }), "Error during decompression"); } } // namespace @@ -1149,11 +1151,11 @@ rmm::device_buffer reader::impl::decompress_page_data( std::vector> comp_out; comp_out.reserve(num_comp_pages); - rmm::device_uvector comp_stats(num_comp_pages, _stream); + rmm::device_uvector comp_res(num_comp_pages, _stream); thrust::fill(rmm::exec_policy(_stream), - comp_stats.begin(), - comp_stats.end(), - decompress_status{0, static_cast(-1000), 0}); + comp_res.begin(), + comp_res.end(), + compression_result{0, compression_status::FAILURE}); size_t decomp_offset = 0; int32_t start_pos = 0; @@ -1177,31 +1179,30 @@ rmm::device_buffer reader::impl::decompress_page_data( host_span const> comp_out_view(comp_out.data() + start_pos, codec.num_pages); auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, _stream); - device_span d_comp_stats_view(comp_stats.data() + start_pos, - codec.num_pages); + device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case parquet::GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_stats_view, gzip_header_included::YES, _stream); + gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, _stream); break; case parquet::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, - d_comp_stats_view, + 
d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, _stream); } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_stats_view, _stream); + gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, _stream); } break; case parquet::ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, - d_comp_stats_view, + d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, _stream); @@ -1209,7 +1210,7 @@ rmm::device_buffer reader::impl::decompress_page_data( case parquet::BROTLI: gpu_debrotli(d_comp_in, d_comp_out, - d_comp_stats_view, + d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), _stream); @@ -1219,7 +1220,7 @@ rmm::device_buffer reader::impl::decompress_page_data( start_pos += codec.num_pages; } - decompress_check(comp_stats, _stream); + decompress_check(comp_res, _stream); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 58910420173..2bfd7c1ba4d 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -24,6 +24,7 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" +#include #include #include #include @@ -43,8 +44,6 @@ #include #include -#include - #include #include #include @@ -79,6 +78,7 @@ parquet::Compression to_parquet_compression(compression_type compression) switch (compression) { case compression_type::AUTO: case compression_type::SNAPPY: return parquet::Compression::SNAPPY; + case compression_type::ZSTD: return parquet::Compression::ZSTD; case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } @@ -907,11 +907,36 @@ void writer::impl::gather_fragment_statistics( stream.synchronize(); } +auto to_nvcomp_compression_type(Compression codec) +{ + if (codec == Compression::SNAPPY) return 
nvcomp::compression_type::SNAPPY; + if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; + CUDF_FAIL("Unsupported compression type"); +} + +auto page_alignment(Compression codec) +{ + if (codec == Compression::UNCOMPRESSED or + not nvcomp::is_compression_enabled(to_nvcomp_compression_type(codec))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); +} + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) +{ + if (codec == Compression::UNCOMPRESSED) return 0; + + return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); +} + auto init_page_sizes(hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, + Compression compression_codec, rmm::cuda_stream_view stream) { if (chunks.is_empty()) { return hostdevice_vector{}; } @@ -926,6 +951,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_codec), nullptr, nullptr, stream); @@ -949,6 +975,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_codec), nullptr, nullptr, stream); @@ -956,12 +983,12 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Get per-page max compressed size hostdevice_vector comp_page_sizes(num_pages, stream); - std::transform(page_sizes.begin(), page_sizes.end(), comp_page_sizes.begin(), [](auto page_size) { - size_t page_comp_max_size = 0; - nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - page_size, nvcompBatchedSnappyDefaultOpts, &page_comp_max_size); - return page_comp_max_size; - }); + std::transform(page_sizes.begin(), + page_sizes.end(), + comp_page_sizes.begin(), + [compression_codec](auto page_size) { + return max_compression_output_size(compression_codec, page_size); + }); 
comp_page_sizes.host_to_device(stream); // Use per-page max compressed size to calculate chunk.compressed_size @@ -973,6 +1000,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_codec), nullptr, nullptr, stream); @@ -1091,6 +1119,7 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& num_columns, max_page_size_bytes, max_page_size_rows, + page_alignment(compression_), (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, stream); @@ -1109,83 +1138,6 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& stream.synchronize(); } -void snappy_compress(device_span const> comp_in, - device_span const> comp_out, - device_span comp_stats, - size_t max_page_uncomp_data_size, - rmm::cuda_stream_view stream) -{ - size_t num_comp_pages = comp_in.size(); - try { - size_t temp_size; - nvcompStatus_t nvcomp_status = nvcompBatchedSnappyCompressGetTempSize( - num_comp_pages, max_page_uncomp_data_size, nvcompBatchedSnappyDefaultOpts, &temp_size); - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Error in getting snappy compression scratch size"); - - // Not needed now but nvcomp API makes no promises about future - rmm::device_buffer scratch(temp_size, stream); - // Analogous to comp_in.srcDevice - rmm::device_uvector uncompressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.srcSize - rmm::device_uvector uncompressed_data_sizes(num_comp_pages, stream); - // Analogous to comp_in.dstDevice - rmm::device_uvector compressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_stat.bytes_written - rmm::device_uvector compressed_bytes_written(num_comp_pages, stream); - // nvcomp does not currently use comp_in.dstSize. 
Cannot assume that the output will fit in - // the space allocated unless one uses the API nvcompBatchedSnappyCompressGetOutputSize() - - // Prepare the vectors - auto comp_it = - thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin()); - thrust::transform( - rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); - - thrust::transform(rmm::exec_policy(stream), - comp_out.begin(), - comp_out.end(), - compressed_data_ptrs.begin(), - [] __device__(auto const& out) { return out.data(); }); - nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), - uncompressed_data_sizes.data(), - max_page_uncomp_data_size, - num_comp_pages, - scratch.data(), // Not needed rn but future - scratch.size(), - compressed_data_ptrs.data(), - compressed_bytes_written.data(), - nvcompBatchedSnappyDefaultOpts, - stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in snappy compression"); - - // nvcomp also doesn't use comp_out.status . It guarantees that given enough output space, - // compression will succeed. - // The other `comp_out` field is `reserved` which is for internal cuIO debugging and can be 0. - thrust::transform(rmm::exec_policy(stream), - compressed_bytes_written.begin(), - compressed_bytes_written.end(), - comp_stats.begin(), - [] __device__(size_t size) { - decompress_status status{}; - status.bytes_written = size; - return status; - }); - return; - } catch (...) 
{ - // If we reach this then there was an error in compressing so set an error status for each page - thrust::for_each(rmm::exec_policy(stream), - comp_stats.begin(), - comp_stats.end(), - [] __device__(decompress_status & stat) { stat.status = 1; }); - }; -} - void writer::impl::encode_pages(hostdevice_2dvector& chunks, device_span pages, size_t max_page_uncomp_data_size, @@ -1209,24 +1161,37 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); - rmm::device_uvector comp_stats(max_comp_pages, stream); + rmm::device_uvector comp_res(max_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); + gpu::EncodePages(batch_pages, comp_in, comp_out, comp_res, stream); switch (compression_) { case parquet::Compression::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { - snappy_compress(comp_in, comp_out, comp_stats, max_page_uncomp_data_size, stream); + if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { + nvcomp::batched_compress( + nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } else { - gpu_snap(comp_in, comp_out, comp_stats, stream); + gpu_snap(comp_in, comp_out, comp_res, stream); + } + break; + case parquet::Compression::ZSTD: + if (nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { + nvcomp::batched_compress( + nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); } break; - default: break; + case parquet::Compression::UNCOMPRESSED: break; + default: CUDF_FAIL("invalid compression type"); } + // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); 
DecideCompression(d_chunks_in_batch.flat_view(), stream); - EncodePageHeaders(batch_pages, comp_stats, batch_pages_stats, chunk_stats, stream); + EncodePageHeaders(batch_pages, comp_res, batch_pages_stats, chunk_stats, stream); GatherPages(d_chunks_in_batch.flat_view(), pages, stream); if (column_stats != nullptr) { @@ -1274,6 +1239,18 @@ size_t writer::impl::column_index_buffer_size(gpu::EncColumnChunk* ck) const return ck->ck_stat_size * ck->num_pages + column_index_truncate_length + padding; } +size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) +{ + if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + + auto const ncomp_type = to_nvcomp_compression_type(compression); + auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) + ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) + : std::nullopt; + + return std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); +} + writer::impl::impl(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, @@ -1281,11 +1258,11 @@ writer::impl::impl(std::vector> sinks, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + compression_(to_parquet_compression(options.get_compression())), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - max_page_size_bytes(options.get_max_page_size_bytes()), + max_page_size_bytes(max_page_bytes(compression_, options.get_max_page_size_bytes())), max_page_size_rows(options.get_max_page_size_rows()), - compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), column_index_truncate_length(options.get_column_index_truncate_length()), @@ -1306,11 +1283,11 @@ writer::impl::impl(std::vector> sinks, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + 
compression_(to_parquet_compression(options.get_compression())), max_row_group_size{options.get_row_group_size_bytes()}, max_row_group_rows{options.get_row_group_size_rows()}, - max_page_size_bytes(options.get_max_page_size_bytes()), + max_page_size_bytes(max_page_bytes(compression_, options.get_max_page_size_bytes())), max_page_size_rows(options.get_max_page_size_rows()), - compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), column_index_truncate_length(options.get_column_index_truncate_length()), @@ -1405,13 +1382,15 @@ void writer::impl::write(table_view const& table, std::vector co // iteratively reduce this value if the largest fragment exceeds the max page size limit (we // ideally want the page size to be below 1MB so as to have enough pages to get good // compression/decompression performance). - using cudf::io::parquet::gpu::max_page_fragment_size; + auto max_page_fragment_size = + (cudf::io::parquet::gpu::max_page_fragment_size * max_page_size_bytes) / + default_max_page_size_bytes; std::vector num_frag_in_part; std::transform(partitions.begin(), partitions.end(), std::back_inserter(num_frag_in_part), - [](auto const& part) { + [max_page_fragment_size](auto const& part) { return util::div_rounding_up_unsafe(part.num_rows, max_page_fragment_size); }); @@ -1561,8 +1540,8 @@ void writer::impl::write(table_view const& table, std::vector co } // Build chunk dictionaries and count pages - hostdevice_vector comp_page_sizes = - init_page_sizes(chunks, col_desc, num_columns, max_page_size_bytes, max_page_size_rows, stream); + hostdevice_vector comp_page_sizes = init_page_sizes( + chunks, col_desc, num_columns, max_page_size_bytes, max_page_size_rows, compression_, stream); // Get the maximum page size across all chunks size_type max_page_uncomp_data_size = diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 
c6309488d6b..cac75a5dcd9 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -208,11 +208,11 @@ class writer::impl { // Cuda stream to be used rmm::cuda_stream_view stream; + Compression compression_ = Compression::UNCOMPRESSED; size_t max_row_group_size = default_row_group_size_bytes; size_type max_row_group_rows = default_row_group_size_rows; size_t max_page_size_bytes = default_max_page_size_bytes; size_type max_page_size_rows = default_max_page_size_rows; - Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; size_type column_index_truncate_length = default_column_index_truncate_length; diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 134f262cb13..c51a7854e25 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -58,7 +58,7 @@ struct DecompressTest : public cudf::test::BaseFixture { inf_out[0] = dst; inf_out.host_to_device(stream); - hostdevice_vector inf_stat(1, stream); + hostdevice_vector inf_stat(1, stream); inf_stat[0] = {}; inf_stat.host_to_device(stream); @@ -66,7 +66,7 @@ struct DecompressTest : public cudf::test::BaseFixture { cudaMemcpyAsync( decompressed->data(), dst.data(), dst.size(), cudaMemcpyDeviceToHost, stream.value()); inf_stat.device_to_host(stream, true); - ASSERT_EQ(inf_stat[0].status, 0); + ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } }; @@ -76,7 +76,7 @@ struct DecompressTest : public cudf::test::BaseFixture { struct GzipDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { cudf::io::gpuinflate(d_inf_in, d_inf_out, @@ -92,7 +92,7 @@ struct GzipDecompressTest : public DecompressTest { struct SnappyDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, 
- device_span d_inf_stat) + device_span d_inf_stat) { cudf::io::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, cudf::default_stream_value); } @@ -104,7 +104,7 @@ struct SnappyDecompressTest : public DecompressTest { struct BrotliDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1), cudf::default_stream_value}; diff --git a/java/src/main/java/ai/rapids/cudf/CompressionType.java b/java/src/main/java/ai/rapids/cudf/CompressionType.java index 48f980d7f71..96edf1a8add 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionType.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -44,11 +44,21 @@ public enum CompressionType { ZIP(6), /** XZ format using LZMA(2) algorithm */ - XZ(7); + XZ(7), + + /** ZLIB format, using DEFLATE algorithm */ + ZLIB(8), + + /** LZ4 format, using LZ77 */ + LZ4(9), + + /** Lempel–Ziv–Oberhumer format */ + LZO(10), + + /** Zstandard format */ + ZSTD(11); final int nativeId; - CompressionType(int nativeId) { - this.nativeId = nativeId; - } + CompressionType(int nativeId) { this.nativeId = nativeId; } } diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 66b841fd273..1c9f388873c 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -163,6 +163,8 @@ cdef compression_type _get_comp_type(object compression): return compression_type.SNAPPY elif compression == "ZLIB": return compression_type.ZLIB + elif compression == "ZSTD": + return compression_type.ZSTD else: raise ValueError(f"Unsupported `compression` type {compression}") diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 84e0bba7133..3c8e78bd87a 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -670,6 +670,8 @@ cdef cudf_io_types.compression_type _get_comp_type(object compression): return cudf_io_types.compression_type.NONE elif compression == "snappy": return cudf_io_types.compression_type.SNAPPY + elif compression == "ZSTD": + return cudf_io_types.compression_type.ZSTD else: raise ValueError("Unsupported `compression` type") diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index db52e51bd33..c2188003531 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1745,19 +1745,18 @@ def test_writer_protobuf_large_rowindexentry(): assert_frame_equal(df, got) -def test_orc_writer_zlib_compression(list_struct_buff): +@pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"]) +def test_orc_writer_nvcomp(list_struct_buff, compression): expected = 
cudf.read_orc(list_struct_buff) + + buff = BytesIO() try: - # save with ZLIB compression - buff = BytesIO() - expected.to_orc(buff, compression="ZLIB") - got = cudf.read_orc(buff) + expected.to_orc(buff, compression=compression) + except RuntimeError: + pytest.mark.xfail(reason="Newer nvCOMP version is required") + else: + got = pd.read_orc(buff) assert_eq(expected, got) - except RuntimeError as e: - if "Unsupported compression type" in str(e): - pytest.mark.xfail(reason="nvcomp build doesn't have deflate") - else: - raise e @pytest.mark.parametrize("index", [True, False, None]) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 84d89618909..022f7cdd6f7 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2568,3 +2568,23 @@ def test_parquet_nested_struct_list(): actual = cudf.read_parquet(buffer) assert_eq(expected, actual) assert_eq(actual.a.dtype, df.a.dtype) + + +def test_parquet_writer_zstd(): + size = 12345 + expected = cudf.DataFrame( + { + "a": np.arange(0, stop=size, dtype="float64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + + buff = BytesIO() + try: + expected.to_orc(buff, compression="ZSTD") + except RuntimeError: + pytest.mark.xfail(reason="Newer nvCOMP version is required") + else: + got = pd.read_orc(buff) + assert_eq(expected, got) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index a3bb81c6c24..19815c7c506 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -208,7 +208,7 @@ File path or Root Directory path. Will be used as Root Directory path while writing a partitioned dataset. Use list of str with partition_offsets to write parts of the dataframe to different files. -compression : {'snappy', None}, default 'snappy' +compression : {'snappy', 'ZSTD', None}, default 'snappy' Name of the compression to use. 
Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. If @@ -429,7 +429,7 @@ ---------- fname : str File path or object where the ORC dataset will be stored. -compression : {{ 'snappy', 'ZLIB', None }}, default None +compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default None Name of the compression to use. Use None for no compression. enable_statistics: boolean, default True Enable writing column statistics. From d6952ba42bd815e8a54e02b907c3790ce26ea488 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 12 Sep 2022 15:16:22 -0700 Subject: [PATCH 13/25] Publish C++ developer docs (#11475) This PR adds the developer documentation to our doxygen. The changes to enable this are minimal: the files have been moved from cpp/docs to cpp/doxygen and added to the `INPUT` section of the Doxyfile, and a custom `DoxygenLayout.xml` has been added to include the necessary header (note that this file was autogenerated with `doxygen -l` and the only modification was adding one tab for the developer guide). I added an anchor to the main developer guide header so that it can be linked to from the header. Our current developer guide was written to be viewed on Github. As a result, it uses some [Github Flavored Markdown](https://github.github.com/gfm/) extensions, primarily around the way that code is displayed. Since doxygen does not support those, I had to make some modifications to the contents of the docs so that they would render the same way. Some care is needed around escaping the `@` symbol in the right locations in the docs to prevent doxygen from interpreting commands in examples. Finally due to https://github.com/doxygen/doxygen/issues/6054 we cannot put certain commands inside code blocks and get them rendered correctly. However, with a few cycles of rebuilding and checking the output I was able to get everything to look correct. 
These changes mean that the guide will no longer look as nice as before when viewed directly on Github. However, the goal of this PR is to allow everyone to move towards viewing the built and published documentation instead, so this isn't a problem. This is how it looks now: ![image](https://user-images.githubusercontent.com/1538165/182971788-b946f406-f490-4698-a484-9046e37a4d88.png) Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11475 --- cpp/doxygen/Doxyfile | 8 +- cpp/doxygen/DoxygenLayout.xml | 227 ++++++++ .../developer_guide}/BENCHMARKING.md | 0 .../developer_guide}/DEVELOPER_GUIDE.md | 55 +- .../developer_guide}/DOCUMENTATION.md | 542 ++++++++---------- .../developer_guide}/TESTING.md | 77 +-- .../developer_guide}/strings.png | Bin cpp/doxygen/modify_fences.sh | 9 + 8 files changed, 563 insertions(+), 355 deletions(-) create mode 100644 cpp/doxygen/DoxygenLayout.xml rename cpp/{docs => doxygen/developer_guide}/BENCHMARKING.md (100%) rename cpp/{docs => doxygen/developer_guide}/DEVELOPER_GUIDE.md (98%) rename cpp/{docs => doxygen/developer_guide}/DOCUMENTATION.md (50%) rename cpp/{docs => doxygen/developer_guide}/TESTING.md (92%) rename cpp/{docs => doxygen/developer_guide}/strings.png (100%) create mode 100755 cpp/doxygen/modify_fences.sh diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 5f43f5af0e4..871632b053d 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -739,7 +739,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = DoxygenLayout.xml # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. 
The .bib @@ -830,6 +830,10 @@ WARN_LOGFILE = INPUT = main_page.md \ regex.md \ unicode.md \ + developer_guide/BENCHMARKING.md \ + developer_guide/DOCUMENTATION.md \ + developer_guide/DEVELOPER_GUIDE.md \ + developer_guide/TESTING.md \ ../include \ ../include/cudf_test/column_wrapper.hpp \ ../include/cudf_test/column_utilities.hpp \ @@ -975,7 +979,7 @@ INPUT_FILTER = # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. -FILTER_PATTERNS = +FILTER_PATTERNS = *.md=./modify_fences.sh # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for diff --git a/cpp/doxygen/DoxygenLayout.xml b/cpp/doxygen/DoxygenLayout.xml new file mode 100644 index 00000000000..a78a1cb701f --- /dev/null +++ b/cpp/doxygen/DoxygenLayout.xml @@ -0,0 +1,227 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/cpp/docs/BENCHMARKING.md b/cpp/doxygen/developer_guide/BENCHMARKING.md similarity index 100% rename from cpp/docs/BENCHMARKING.md rename to cpp/doxygen/developer_guide/BENCHMARKING.md diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md similarity index 98% rename from cpp/docs/DEVELOPER_GUIDE.md rename to cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 33dd341a7e8..b3774aeda38 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1,4 +1,4 @@ -# libcudf C++ Developer Guide +# libcudf C++ Developer Guide {#DEVELOPER_GUIDE} This document serves as 
a guide for contributors to libcudf C++ code. Developers should also refer to these additional files for further documentation of libcudf best practices. @@ -185,7 +185,7 @@ and produce `unique_ptr`s to owning objects as output. For example, std::unique_ptr sort(table_view const& input); ``` -## `rmm::device_memory_resource` +## rmm::device_memory_resource libcudf allocates all device memory via RMM memory resources (MR). See the [RMM documentation](https://github.com/rapidsai/rmm/blob/main/README.md) for details. @@ -197,7 +197,7 @@ RMM provides a "default" memory resource for each device that can be accessed an respectively. All memory resource parameters should be defaulted to use the return value of `rmm::mr::get_current_device_resource()`. -## `cudf::column` +## cudf::column `cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either a `cudf::column` or a `cudf::table` as output. A `column` contains `device_buffer`s which own the @@ -209,6 +209,7 @@ Movable and copyable. A copy performs a deep copy of the column's contents, wher the contents from one column to another. Example: + ```c++ cudf::column col{...}; @@ -222,7 +223,7 @@ column_view v = moved_to; // Implicit conversion to non-owning column_view A `column` may have nested (child) columns, depending on the data type of the column. For example, `LIST`, `STRUCT`, and `STRING` type columns. -### `cudf::column_view` +### cudf::column_view `cudf::column_view` is a core non-owning data structure in libcudf. It is an immutable, non-owning view of device memory as a column. Most libcudf public APIs take views as inputs. @@ -233,24 +234,24 @@ the view would return the element at index `75` of the owning `column`. Internal implemented by storing in the view a pointer, an offset, and a size. `column_view::data()` returns a pointer iterator to `column_view::head() + offset`. 
-### `cudf::mutable_column_view` +### cudf::mutable_column_view A *mutable*, non-owning view of device memory as a column. Used for detail APIs and (rare) public APIs that modify columns in place. -### `cudf::column_device_view` +### cudf::column_device_view An immutable, non-owning view of device data as a column of elements that is trivially copyable and usable in CUDA device code. Used to pass `column_view` data as input to CUDA kernels and device functions (including Thrust algorithms) -### `cudf::mutable_column_device_view` +### cudf::mutable_column_device_view A mutable, non-owning view of device data as a column of elements that is trivially copyable and usable in CUDA device code. Used to pass `column_view` data to be modified on the device by CUDA kernels and device functions (including Thrust algorithms). -## `cudf::table` +## cudf::table Owning class for a set of `cudf::column`s all with equal number of elements. This is the C++ equivalent to a data frame. @@ -260,11 +261,11 @@ Implicitly convertible to `cudf::table_view` and `cudf::mutable_table_view` Movable and copyable. A copy performs a deep copy of all columns, whereas a move moves all columns from one table to another. -### `cudf::table_view` +### cudf::table_view An *immutable*, non-owning view of a table. -### `cudf::mutable_table_view` +### cudf::mutable_table_view A *mutable*, non-owning view of a table. @@ -292,7 +293,7 @@ template std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) ``` -## `cudf::scalar` +## cudf::scalar A `cudf::scalar` is an object that can represent a singular, nullable value of any of the types currently supported by cudf. Each type of value is represented by a separate type of scalar class @@ -464,7 +465,7 @@ rmm::device_buffer some_function( libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to use `device_memory_resource`(*)s for device memory allocation with automated lifetime management. 
-#### `rmm::device_buffer` +#### rmm::device_buffer Allocates a specified number of bytes of untyped, uninitialized device memory using a `device_memory_resource`. If no resource is explicitly provided, uses `rmm::mr::get_current_device_resource()`. @@ -490,7 +491,7 @@ custom_memory_resource *mr...; rmm::device_buffer custom_buff(100, mr, stream); ``` -#### `rmm::device_scalar` +#### rmm::device_scalar Allocates a single element of the specified type initialized to the specified value. Use this for scalar input/outputs into device kernels, e.g., reduction results, null count, etc. This is effectively a convenience wrapper around a `rmm::device_vector` of length 1. @@ -508,7 +509,7 @@ kernel<<<...>>>(int_scalar.data(),...); int host_value = int_scalar.value(); ``` -#### `rmm::device_vector` +#### rmm::device_vector Allocates a specified number of elements of the specified type. If no initialization value is provided, all elements are default initialized (this incurs a kernel launch). @@ -520,7 +521,7 @@ utilities enable creation of `uvector`s from host-side vectors, or creating zero `uvector`s, so that they are as convenient to use as `device_vector`. Avoiding `device_vector` has a number of benefits, as described in the following section on `rmm::device_uvector`. -#### `rmm::device_uvector` +#### rmm::device_uvector Similar to a `device_vector`, allocates a contiguous set of elements in device memory but with key differences: @@ -544,7 +545,7 @@ rmm::mr::device_memory_resource * mr = new my_custom_resource{...}; rmm::device_uvector v2{100, s, mr}; ``` -## Input/Output Style +## Input/Output Style The preferred style for how inputs are passed in and outputs are returned is the following: - Inputs @@ -651,6 +652,7 @@ std::unique_ptr copy_if_else( FilterFn filter, ...); ``` + `LeftIter` and `RightIter` need only implement the necessary interface for an iterator. 
libcudf provides a number of iterator types and utilities that are useful with iterator-based APIs from libcudf as well as Thrust algorithms. Most are defined in `include/detail/iterator.cuh`. @@ -708,6 +710,7 @@ thrust::lower_bound(rmm::exec_policy(stream), ### External All public libcudf APIs should be placed in the `cudf` namespace. Example: + ```c++ namespace cudf{ void public_function(...); @@ -725,7 +728,7 @@ namespace. Many functions are not meant for public use, so place them in either the `detail` or an *anonymous* namespace, depending on the situation. -#### `detail` namespace +#### detail namespace Functions or objects that will be used across *multiple* translation units (i.e., source files), should be exposed in an internal header file and placed in the `detail` namespace. Example: @@ -788,6 +791,7 @@ exceptions. Use the `CUDF_EXPECTS` macro to enforce runtime conditions necessary for correct execution. Example usage: + ```c++ CUDF_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch"); ``` @@ -803,6 +807,7 @@ Use the `CUDF_FAIL` macro for such errors. This is effectively the same as calli `CUDF_EXPECTS(false, reason)`. Example: + ```c++ CUDF_FAIL("This code path should not be reached."); ``` @@ -1121,13 +1126,11 @@ The second challenge is that in an out-of-place operation on a strings column, u width elements, the size of the output cannot be known *a priori*. For example, consider scattering into a column of strings: -```c++ -destination: {"this", "is", "a", "column", "of", "strings"} -scatter_map: {1, 3, 5} -scatter_values: {"red", "green", "blue"} + destination: {"this", "is", "a", "column", "of", "strings"} + scatter_map: {1, 3, 5} + scatter_values: {"red", "green", "blue"} -result: {"this", "red", "a", "green", "of", "blue"} -``` + result: {"this", "red", "a", "green", "of", "blue"} In this example, the strings "red", "green", and "blue" will respectively be scattered into positions `1`, `3`, and `5` of `destination`. 
Recall from above that this operation cannot be done @@ -1151,7 +1154,7 @@ with the corresponding strings from either `destination` or `scatter_values`. libcudf provides view types for nested column types as well as for the data elements within them. -### `cudf::strings_column_view` and `cudf::string_view` +### cudf::strings_column_view and cudf::string_view `cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore @@ -1195,14 +1198,14 @@ most operations. The `string_view.cuh` header also includes some utility methods for reading and writing (`to_char_utf8/from_char_utf8`) individual UTF-8 characters to/from byte arrays. -### `cudf::lists_column_view` and `cudf::lists_view` +### cudf::lists_column_view and cudf::lists_view `cudf::lists_column_view` is a view of a lists column. `cudf::list_view` is a view of a single list, and therefore `cudf::list_view` is the data type of a `cudf::column` of type `LIST`. `cudf::type_dispatcher` dispatches to the `list_view` data type when invoked on a `LIST` column. -### `cudf::structs_column_view` and `cudf::struct_view` +### cudf::structs_column_view and cudf::struct_view `cudf::structs_column_view` is a view of a structs column. `cudf::struct_view` is a view of a single struct, and therefore `cudf::struct_view` is the data type of a `cudf::column` of type `STRUCT`. diff --git a/cpp/docs/DOCUMENTATION.md b/cpp/doxygen/developer_guide/DOCUMENTATION.md similarity index 50% rename from cpp/docs/DOCUMENTATION.md rename to cpp/doxygen/developer_guide/DOCUMENTATION.md index f2de048d721..c9f38d5849b 100644 --- a/cpp/docs/DOCUMENTATION.md +++ b/cpp/doxygen/developer_guide/DOCUMENTATION.md @@ -7,23 +7,21 @@ These guidelines apply to documenting all libcudf C++ source files using doxygen The copyright comment is included here but may also be mentioned in a coding guideline document as well. 
The following is the license header comment that should appear at the beginning of every C++ source file. -```c++ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -``` + /* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ The comment should start with `/*` and not `/**` so it is not processed by doxygen. @@ -36,8 +34,8 @@ Changing the copyright year may not be necessary if no content has changed (e.g. ## Doxygen -The [doxygen tool](http://www.doxygen.nl/manual/index.html) is used to generate HTML pages from the C++ comments in the source code. -Doxygen recognizes and parses block comments and performs specialized output formatting when it encounters [doxygen commands](http://www.doxygen.nl/manual/commands.html). 
+The [doxygen tool](https://www.doxygen.nl/manual/index.html) is used to generate HTML pages from the C++ comments in the source code. +Doxygen recognizes and parses block comments and performs specialized output formatting when it encounters [doxygen commands](https://www.doxygen.nl/manual/commands.html). There are almost 200 commands (also called tags in this document) that doxygen recognizes in comment blocks. This document provides guidance on which commands/tags to use and how to use them in the libcudf C++ source code. @@ -57,12 +55,10 @@ Here are some of the custom options in the Doxyfile for libcudf. Use the following style for block comments describing functions, classes and other types, groups, and files. -```c++ -/** - * description text and - * doxygen tags go here - */ -``` + /** + * description text and + * doxygen tags go here + */ Doxygen comment blocks start with `/**` and end with `*/` only, and with nothing else on those lines. Do not add dashes `-----` or extra asterisks `*****` to the first and last lines of a doxygen block. @@ -75,7 +71,7 @@ Any text on these lines, including tag declarations, should start after a single ## Tag/Command names -Use `@` to prefix doxygen commands (e.g. `@brief`, `@code`, etc.) +Use @ to prefix doxygen commands (e.g. \@brief, \@code, etc.) ## Markdown @@ -91,102 +87,100 @@ Although doxygen supports markdown and markdown supports HTML tags, the HTML sup The following example covers most of the doxygen block comment and tag styles for documenting C++ code in libcudf. -```c++ -/** - * @file source_file.cpp - * @brief Description of source file contents - * - * Longer description of the source file contents. - */ - -/** - * @brief One sentence description of the class. - * - * @ingroup optional_predefined_group_id - * - * Longer, more detailed description of the class. 
- * - * @tparam T Short description of each template parameter - * @tparam U Short description of each template parameter - */ -template -class example_class { - - void get_my_int(); ///< Simple members can be documented like this - void set_my_int( int value ); ///< Try to use descriptive member names - - /** - * @brief Short, one sentence description of the member function. - * - * A more detailed description of what this function does and what - * its logic does. - * - * @code - * example_class inst; - * inst.set_my_int(5); - * int output = inst.complicated_function(1,dptr,fptr); - * @endcode - * - * @param[in] first This parameter is an input parameter to the function - * @param[in,out] second This parameter is used both as an input and output - * @param[out] third This parameter is an output of the function - * - * @return The result of the complex function - */ - T complicated_function(int first, double* second, float* third) - { - // Do not use doxygen-style block comments - // for code logic documentation. - } - - private: - int my_int; ///< An example private member variable -}; - -/** - * @brief Short, one sentence description of this free function. - * - * @ingroup optional_predefined_group_id - * - * A detailed description must start after a blank line. 
- * - * @code - * template - * struct myfunctor { - * bool operator()(T input) { return input % 2 > 0; } - * }; - * free_function(myfunctor{},12); - * @endcode - * - * @throw cudf::logic_error if `input_argument` is negative or zero - * - * @tparam functor_type The type of the functor - * @tparam input_type The datatype of the input argument - * - * @param[in] functor The functor to be called on the input argument - * @param[in] input_argument The input argument passed into the functor - * @return The result of calling the functor on the input argument - */ -template -bool free_function(functor_type functor, input_type input_argument) -{ - CUDF_EXPECTS( input_argument > 0, "input_argument must be positive"); - return functor(input_argument); -} - -/** - * @brief Short, one sentence description. - * - * @ingroup optional_predefined_group_id - * - * Optional, longer description. - */ -enum class example_enum { - first_enum, ///< Description of the first enum - second_enum, ///< Description of the second enum - third_enum ///< Description of the third enum -}; -``` + /** + * @file source_file.cpp + * @brief Description of source file contents + * + * Longer description of the source file contents. + */ + + /** + * @brief One sentence description of the class. + * + * @ingroup optional_predefined_group_id + * + * Longer, more detailed description of the class. + * + * @tparam T Short description of each template parameter + * @tparam U Short description of each template parameter + */ + template + class example_class { + + void get_my_int(); ///< Simple members can be documented like this + void set_my_int( int value ); ///< Try to use descriptive member names + + /** + * @brief Short, one sentence description of the member function. + * + * A more detailed description of what this function does and what + * its logic does. 
+ * + * @code + * example_class inst; + * inst.set_my_int(5); + * int output = inst.complicated_function(1,dptr,fptr); + * @endcode + * + * @param[in] first This parameter is an input parameter to the function + * @param[in,out] second This parameter is used both as an input and output + * @param[out] third This parameter is an output of the function + * + * @return The result of the complex function + */ + T complicated_function(int first, double* second, float* third) + { + // Do not use doxygen-style block comments + // for code logic documentation. + } + + private: + int my_int; ///< An example private member variable + }; + + /** + * @brief Short, one sentence description of this free function. + * + * @ingroup optional_predefined_group_id + * + * A detailed description must start after a blank line. + * + * @code + * template + * struct myfunctor { + * bool operator()(T input) { return input % 2 > 0; } + * }; + * free_function(myfunctor{},12); + * @endcode + * + * @throw cudf::logic_error if `input_argument` is negative or zero + * + * @tparam functor_type The type of the functor + * @tparam input_type The datatype of the input argument + * + * @param[in] functor The functor to be called on the input argument + * @param[in] input_argument The input argument passed into the functor + * @return The result of calling the functor on the input argument + */ + template + bool free_function(functor_type functor, input_type input_argument) + { + CUDF_EXPECTS( input_argument > 0, "input_argument must be positive"); + return functor(input_argument); + } + + /** + * @brief Short, one sentence description. + * + * @ingroup optional_predefined_group_id + * + * Optional, longer description. 
+ */ + enum class example_enum { + first_enum, ///< Description of the first enum + second_enum, ///< Description of the second enum + third_enum ///< Description of the third enum + }; ## Descriptions @@ -198,45 +192,39 @@ Also, try to include a short [example](#inline-examples) if possible. ### @brief -The `@brief` text should be a short, one sentence description. +The [\@brief](https://www.doxygen.nl/manual/commands.html#cmdbrief) text should be a short, one sentence description. Doxygen does not provide much space to show this text in the output pages. -Always follow the `@brief` line with a blank comment line. +Always follow the \@brief line with a blank comment line. The longer description is the rest of the comment text that is not tagged with any doxygen command. -```c++ -/** - * @brief Short description. - * - * Long description. - * -``` + /** + * @brief Short description. + * + * Long description. + * -### @copydoc +### \@copydoc Documentation for declarations in headers should be clear and complete. -You can use the `@copydoc` tag to avoid duplicating the comment block for a function definition. - -```c++ - /** - * @copydoc complicated_function(int,double*,float*) - * - * Any extra documentation. - */ -``` - -Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter. - -```c++ -/** - * @copydoc cudf::segmented_count_set_bits(bitmask_type const*,std::vector const&) - * - * @param[in] stream Optional CUDA stream on which to execute kernels - */ -std::vector segmented_count_set_bits(bitmask_type const* bitmask, - std::vector const& indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); -``` +You can use the [\@copydoc](https://www.doxygen.nl/manual/commands.html#cmdcopydoc) tag to avoid duplicating the comment block for a function definition. + + /** + * @copydoc complicated_function(int,double*,float*) + * + * Any extra documentation. 
+ */ + +Also, \@copydoc is useful when documenting a `detail` function that differs only by the `stream` parameter. + + /** + * @copydoc cudf::segmented_count_set_bits(bitmask_type const*,std::vector const&) + * + * @param[in] stream Optional CUDA stream on which to execute kernels + */ + std::vector segmented_count_set_bits(bitmask_type const* bitmask, + std::vector const& indices, + rmm::cuda_stream_view stream = cudf::default_stream_value); Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it. @@ -246,156 +234,136 @@ The following tags should appear near the end of function comment block in the o | Command | Description | | ------- | ----------- | -| [@throw](#throw) | Specify the conditions in which the function may throw an exception | -| [@tparam](#tparam) | Description for each template parameter | -| [@param](#param) | Description for each function parameter | -| [@return](#return) | Short description of object or value returned | +| [\@throw](#throw) | Specify the conditions in which the function may throw an exception | +| [\@tparam](#tparam) | Description for each template parameter | +| [\@param](#param) | Description for each function parameter | +| [\@return](#return) | Short description of object or value returned | -#### @throw +#### \@throw -Add an [@throw](http://www.doxygen.nl/manual/commands.html#cmdthrow) comment line in the doxygen block for each exception that the function may throw. +Add an [\@throw](https://www.doxygen.nl/manual/commands.html#cmdthrow) comment line in the doxygen block for each exception that the function may throw. You only need to include exceptions thrown by the function itself. If the function calls another function that may throw an exception, you do not need to document those exceptions here. Include the name of the exception without backtick marks so doxygen can add reference links correctly. 
-```c++ - * - * @throw cudf::logic_error if `input_argument` is negative or zero - * -``` + * + * @throw cudf::logic_error if `input_argument` is negative or zero + * -Using `@throws` is also acceptable but vs-code and other tools only do syntax highlighting on `@throw`. +Using \@throws is also acceptable but VS Code and other tools only do syntax highlighting on \@throw. #### @tparam -Add a [@tparam](http://www.doxygen.nl/manual/commands.html#cmdtparam) comment line for each template parameter declared by this function. +Add a [\@tparam](https://www.doxygen.nl/manual/commands.html#cmdtparam) comment line for each template parameter declared by this function. The name of the parameter specified after the doxygen tag must match exactly to the template parameter name. -```c++ - * - * @tparam functor_type The type of the functor - * @tparam input_type The datatype of the input argument - * -``` + * + * @tparam functor_type The type of the functor + * @tparam input_type The datatype of the input argument + * The definition should detail the requirements of the parameter. For example, if the template is for a functor or predicate, then describe the expected input types and output. #### @param -Add a [@param](http://www.doxygen.nl/manual/commands.html#cmdparam) comment line for each function parameter passed to this function. +Add a [\@param](https://www.doxygen.nl/manual/commands.html#cmdparam) comment line for each function parameter passed to this function. The name of the parameter specified after the doxygen tag must match the function's parameter name. Also include append `[in]`, `[out]` or `[in,out]` to the `@param` if it is not clear from the declaration and the parameter name itself. 
-```c++ - * - * @param[in] first This parameter is an input parameter to the function - * @param[in,out] second This parameter is used both as an input and output - * @param[out] third This parameter is an output of the function - * -``` + * + * @param[in] first This parameter is an input parameter to the function + * @param[in,out] second This parameter is used both as an input and output + * @param[out] third This parameter is an output of the function + * It is also recommended to vertically aligning the 3 columns of text if possible to make it easier to read in a source code editor. #### @return -Add a single [@return](http://www.doxygen.nl/manual/commands.html#cmdreturn) comment line at the end of the comment block if the function returns an object or value. +Add a single [\@return](https://www.doxygen.nl/manual/commands.html#cmdreturn) comment line at the end of the comment block if the function returns an object or value. Include a brief description of what is returned. -```c++ -/** - * ... - * - * @return A new column of type INT32 and no nulls - */ -``` + /** + * ... + * + * @return A new column of type INT32 and no nulls + */ Do not include the type of the object returned with the `@return` comment. ### Inline Examples It is usually helpful to include a source code example inside your comment block when documenting a function or other declaration. -Use the [@code](http://www.doxygen.nl/manual/commands.html#cmdcode) and [@endcode](http://www.doxygen.nl/manual/commands.html#cmdendcode) pair to include inline examples. +Use the [\@code](https://www.doxygen.nl/manual/commands.html#cmdcode) and [\@endcode](https://www.doxygen.nl/manual/commands.html#cmdendcode) pair to include inline examples. Doxygen supports syntax highlighting for C++ and several other programming languages (e.g. Python, Java). -By default, the `@code` tag uses syntax highlighting based on the source code in which it is found. 
+By default, the \@code tag uses syntax highlighting based on the source code in which it is found. -```c++ - * - * @code - * auto result = cudf::make_column( ); - * @endcode - * -``` + * + * @code + * auto result = cudf::make_column( ); + * @endcode + * You can specify a different language by indicating the file extension in the tag: -```c++ - * - * @code{.py} - * import cudf - * s = cudf.Series([1,2,3]) - * @endcode - * -``` - -If you wish to use pseudo-code in your example, use the following: - -```c++ - * - * Sometimes pseudo-code is clearer. - * @code{.pseudo} - * s = int column of [ 1, 2, null, 4 ] - * r = fill( s, [1, 2], 0 ) - * r is now [ 1, 0, 0, 4 ] - * @endcode - * -``` + * + * @code{.py} + * import cudf + * s = cudf.Series([1,2,3]) + * @endcode + * + +If you wish to use pseudocode in your example, use the following: + + * + * Sometimes pseudocode is clearer. + * @code{.pseudo} + * s = int column of [ 1, 2, null, 4 ] + * r = fill( s, [1, 2], 0 ) + * r is now [ 1, 0, 0, 4 ] + * @endcode + * When writing example snippets, using fully qualified class names allows doxygen to add reference links to the example. -```c++ - * - * @code - * auto result1 = make_column( ); // reference link will not be created - * auto result2 = cudf::make_column( ); // reference link will be created - * @endcode - * -``` + * + * @code + * auto result1 = make_column( ); // reference link will not be created + * auto result2 = cudf::make_column( ); // reference link will be created + * @endcode + * -Although using 3 backtick marks `` ``` `` for example blocks will work too, they do not standout as well in vs-code and other source editors. +Although using 3 backtick marks \`\`\` for example blocks will work too, they do not stand out as well in VS Code and other source editors. Do not use the `@example` tag in the comments for a declaration, or doxygen will interpret the entire source file as example source code. 
The source file is then published under a separate _Examples_ page in the output. ### Deprecations -Add a single [@deprecated](https://www.doxygen.nl/manual/commands.html#cmddeprecated) comment line +Add a single [\@deprecated](https://www.doxygen.nl/manual/commands.html#cmddeprecated) comment line to comment blocks for APIs that will be removed in future releases. Mention alternative / replacement APIs in the deprecation comment. -```c++ -/** - * ... - * - * @deprecated This function is deprecated. Use another new function instead. - */ -``` + /** + * ... + * + * @deprecated This function is deprecated. Use another new function instead. + */ ## Namespaces Doxygen output includes a _Namespaces_ page that shows all the namespaces declared with comment blocks in the processed files. Here is an example of a doxygen description comment for a namespace declaration. -```c++ -/** - * @brief cuDF interfaces - * - * This is the top-level namespace which contains all cuDF functions and types. - */ -namespace cudf { -``` + /** + * @brief cuDF interfaces + * + * This is the top-level namespace which contains all cuDF functions and types. + */ + namespace cudf { A description comment should be included only once for each unique namespace declaration. Otherwise, if more than one description is found, doxygen aggregates the descriptions in an arbitrary order in the output pages. @@ -406,7 +374,7 @@ If you introduce a new namespace, provide a description block for only one decla Grouping declarations into modules helps users to find APIs in the doxygen pages. Generally, common functions are already grouped logically into header files but doxygen does not automatically group them this way in its output. -The doxygen output includes a _Modules_ page that organizes items into groups specified using the [Grouping doxygen commands](http://www.doxygen.nl/manual/grouping.html). 
+The doxygen output includes a _Modules_ page that organizes items into groups specified using the [Grouping doxygen commands](https://www.doxygen.nl/manual/grouping.html). These commands can group common functions across header files, source files, and even namespaces. Groups can also be nested by defining new groups within existing groups. @@ -415,48 +383,44 @@ The [doxygen_groups.h](../include/doxygen_groups.h) file does not need to be inc Modify this file only to add or update groups. The existing groups have been carefully structured and named, so new groups should be added thoughtfully. -When creating a new API, specify its group using the [@ingroup](http://www.doxygen.nl/manual/commands.html#cmdingroup) tag and the group reference id from the [doxygen_groups.h](../include/doxygen_groups.h) file. - -```c++ -namespace cudf { - -/** - * @brief ... - * - * @ingroup transformation_fill - * - * @param ... - * @return ... - */ -std::unique_ptr fill(table_view const& input,...); - -} // namespace cudf -``` - -You can also use the `@addtogroup` with a `@{ ... @}` pair to automatically include doxygen comment blocks as part of a group. - -```c++ -namespace cudf { -/** - * @addtogroup transformation_fill - * @{ - */ - -/** - * @brief ... - * - * @param ... - * @return ... - */ -std::unique_ptr fill(table_view const& input,...); - -/** @} */ -} // namespace cudf -``` - -This just saves adding `@ingroup` to individual doxygen comment blocks within a file. -Make sure a blank line is included after the `@addtogroup` command block so doxygen knows it does not apply to whatever follows in the source code. -Note that doxygen will not assign groups to items if the `@addtogroup` with `@{ ... @}` pair includes a namespace declaration. +When creating a new API, specify its group using the [\@ingroup](https://www.doxygen.nl/manual/commands.html#cmdingroup) tag and the group reference id from the [doxygen_groups.h](../include/doxygen_groups.h) file. 
+ + namespace cudf { + + /** + * @brief ... + * + * @ingroup transformation_fill + * + * @param ... + * @return ... + */ + std::unique_ptr fill(table_view const& input,...); + + } // namespace cudf + +You can also use the \@addtogroup with a `@{ ... @}` pair to automatically include doxygen comment blocks as part of a group. + + namespace cudf { + /** + * @addtogroup transformation_fill + * @{ + */ + + /** + * @brief ... + * + * @param ... + * @return ... + */ + std::unique_ptr fill(table_view const& input,...); + + /** @} */ + } // namespace cudf + +This just saves adding \@ingroup to individual doxygen comment blocks within a file. +Make sure a blank line is included after the \@addtogroup command block so doxygen knows it does not apply to whatever follows in the source code. +Note that doxygen will not assign groups to items if the \@addtogroup with `@{ ... @}` pair includes a namespace declaration. So include the `@addtogroup` and `@{ ... @}` between the namespace declaration braces as shown in the example above. Summary of groups tags @@ -470,7 +434,7 @@ Summary of groups tags ## Build Doxygen Output We recommend installing Doxygen using conda (`conda install doxygen`) or a Linux package manager (`sudo apt install doxygen`). -Alternatively you can [build and install doxygen from source](http://www.doxygen.nl/manual/install.html). +Alternatively you can [build and install doxygen from source](https://www.doxygen.nl/manual/install.html). To build the libcudf HTML documentation simply run the `doxygen` command from the `cpp/doxygen` directory containing the `Doxyfile`. The libcudf documentation can also be built using `make docs_cudf` from the cmake build directory (e.g. `cpp/build`). @@ -479,7 +443,7 @@ The output is generated in the `cpp/doxygen/html/` directory. You can load the local `index.html` file generated there into any web browser to view the result. 
To view docs built on a remote server, you can run a simple HTTP server using Python: `cd html && python -m http.server`. -Then open `http://:8000` in your local web browser, inserting the IP address of the machine on which you ran the HTTP server. +Then open `http://<ip-address>:8000` in your local web browser, inserting the IP address of the machine on which you ran the HTTP server. The doxygen output is intended for building documentation only for the public APIs and classes. For example, the output should not include documentation for `detail` or `/src` files, and these directories are excluded in the `Doxyfile` configuration. diff --git a/cpp/docs/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md similarity index 92% rename from cpp/docs/TESTING.md rename to cpp/doxygen/developer_guide/TESTING.md index e622e108593..31747e31ccb 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -62,13 +62,12 @@ files, and are therefore preferred in test code over `thrust::device_vector`. All libcudf unit tests should make use of a GTest ["Test Fixture"](https://github.com/google/googletest/blob/master/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests). Even if the fixture is empty, it should inherit from the base fixture `cudf::test::BaseFixture` found in `include/cudf_test/base_fixture.hpp`. This ensures that RMM is properly initialized and -finalized. `cudf::test::BaseFixture` already inherits from `::testing::Test` and therefore it is +finalized. `cudf::test::BaseFixture` already inherits from `testing::Test` and therefore it is not necessary for your test fixtures to inherit from it. Example: -```c++ -class MyTestFixture : public cudf::test::BaseFixture {...}; -``` + + class MyTestFixture : public cudf::test::BaseFixture {...}; ## Typed Tests @@ -79,6 +78,7 @@ the same tests across multiple types, we use GTest's Typed tests allow you to write a test once and run it across a list of types.
For example: + ```c++ // Fixture must be a template template @@ -91,7 +91,7 @@ TYPED_TEST(TypedTestFixture, FirstTest){ } ``` -To specify the list of types to use, instead of GTest's `::testing::Types<...>`, libcudf provides `cudf::test::Types<...>` which is a custom, drop-in replacement for `::testing::Types`. +To specify the list of types to use, instead of GTest's `testing::Types<...>`, libcudf provides `cudf::test::Types<...>` which is a custom, drop-in replacement for `testing::Types`. In this example, all tests using the `TypedTestFixture` fixture will run once for each type in the list defined in `TestTypes` (`int, float, double`). @@ -176,7 +176,7 @@ for initializing a `cudf::column` object usable with libcudf APIs. Any `*_column implicitly convertible to a `column_view` or `mutable_column_view` and therefore may be transparently passed to any API expecting a `column_view` or `mutable_column_view` argument. -#### `fixed_width_column_wrapper` +#### fixed_width_column_wrapper The `fixed_width_column_wrapper` class should be used for constructing and initializing columns of any fixed-width element type, e.g., numeric types, timestamp types, Boolean, etc. 
@@ -194,7 +194,7 @@ fixed_width_column_wrapper w(elements, elements + 5); // Creates a nullable column of INT32 elements with 5 elements: {null, 1, null, 3, null} auto elements = make_counting_transform_iterator(0, [](auto i){return i;}); -auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}) +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}) fixed_width_column_wrapper w(elements, elements + 5, validity); // Creates a non-nullable INT32 column with 4 elements: {1, 2, 3, 4} @@ -204,7 +204,7 @@ fixed_width_column_wrapper w{{1, 2, 3, 4}}; fixed_width_column_wrapper w{ {1,2,3,4}, {1, 0, 1, 0}}; ``` -#### `fixed_point_column_wrapper` +#### fixed_point_column_wrapper The `fixed_point_column_wrapper` class should be used for constructing and initializing columns of any fixed-point element type (DECIMAL32 or DECIMAL64). `fixed_point_column_wrapper` provides @@ -213,18 +213,19 @@ columns, an additional iterator can be provided to indicate the validity of each Constructors also take the scale of the fixed-point values to create. 
Example: + ```c++ -// Creates a non-nullable column of 4 DECIMAL32 elements of scale 3: {1000, 2000, 3000, 4000} -auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); -fixed_point_column_wrapper w(elements, elements + 4, 3); - -// Creates a nullable column of 5 DECIMAL32 elements of scale 2: {null, 100, null, 300, null} -auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); -auto validity = make_counting_transform_iterator(0, [](auto i){ return i%2; }); -fixed_point_column_wrapper w(elements, elements + 5, validity, 2); + // Creates a non-nullable column of 4 DECIMAL32 elements of scale 3: {1000, 2000, 3000, 4000} + auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); + fixed_point_column_wrapper w(elements, elements + 4, 3); + + // Creates a nullable column of 5 DECIMAL32 elements of scale 2: {null, 100, null, 300, null} + auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); + auto validity = make_counting_transform_iterator(0, [](auto i){ return i % 2; }); + fixed_point_column_wrapper w(elements, elements + 5, validity, 2); ``` -#### `dictionary_column_wrapper` +#### dictionary_column_wrapper The `dictionary_column_wrapper` class should be used to create dictionary columns. `dictionary_column_wrapper` provides constructors that accept an iterator range to generate each @@ -233,6 +234,7 @@ validity of each element. There are also constructors that accept a `std::initia the column elements and optionally for the validity of each element. Example: + ```c++ // Creates a non-nullable dictionary column of INT32 elements with 5 elements // keys = {0, 2, 6}, indices = {0, 1, 1, 2, 2} @@ -242,7 +244,7 @@ dictionary_column_wrapper w(element.begin(), elements.end()); // Creates a nullable dictionary column with 5 elements and a validity iterator. std::vector elements{0, 2, 0, 6, 0}; // Validity iterator here sets even rows to null. 
-auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}) +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); // keys = {2, 6}, indices = {NULL, 0, NULL, 1, NULL} dictionary_column_wrapper w(elements, elements + 5, validity); @@ -267,11 +269,11 @@ dictionary_column_wrapper d(strings.begin(), strings.end()); // Creates a nullable dictionary column with 7 string elements and a validity iterator. // Validity iterator here sets even rows to null. // keys = {"a", "bb"}, indices = {NULL, 1, NULL, 1, NULL, 0, NULL} -auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}); +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); dictionary_column_wrapper d({"", "bb", "", "bb", "", "a", ""}, validity); ``` -#### `strings_column_wrapper` +#### strings_column_wrapper The `strings_column_wrapper` class should be used to create columns of strings. It provides constructors that accept an iterator range to generate each string in the column. For nullable also constructors that accept a `std::initializer_list` for the col optionally for the validity of each element.
Example: + ```c++ // Creates a non-nullable STRING column with 7 string elements: // {"", "this", "is", "a", "column", "of", "strings"} @@ -289,7 +292,7 @@ strings_column_wrapper s(strings.begin(), strings.end()); // Creates a nullable STRING column with 7 string elements: // {NULL, "this", NULL, "a", NULL, "of", NULL} std::vector strings{"", "this", "is", "a", "column", "of", "strings"}; -auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}); +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); strings_column_wrapper s(strings.begin(), strings.end(), validity); // Creates a non-nullable STRING column with 7 string elements: @@ -298,11 +301,11 @@ strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}); // Creates a nullable STRING column with 7 string elements: // {NULL, "this", NULL, "a", NULL, "of", NULL} -auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}); +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}, validity); ``` -#### `lists_column_wrapper` +#### lists_column_wrapper The `lists_column_wrapper` class should be used to create columns of lists. It provides constructors that accept an iterator range to generate each list in the column. For nullable @@ -311,6 +314,7 @@ also constructors that accept a `std::initializer_list` for the column's list optionally for the validity of each element. A number of other constructors are available. 
Example: + ```c++ // Creates an empty LIST column // [] @@ -336,13 +340,13 @@ lists_column_wrapper l(elements, elements+5); // Creates a LIST column with 1 lists composed of 2 total integers // [{0, NULL}] -auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}); +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); lists_column_wrapper l{{0, 1}, validity}; // Creates a LIST column with 1 lists composed of 5 total integers // [{0, NULL, 2, NULL, 4}] auto elements = make_counting_transform_iterator(0, [](auto i){return i*2;}); -auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}); +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); lists_column_wrapper l(elements, elements+5, validity); // Creates a LIST column with 1 list composed of 2 total strings @@ -351,11 +355,11 @@ lists_column_wrapper l{"abc", "def"}; // Creates a LIST of LIST columns with 2 lists on the top level and 4 below // [ {{0, 1}, NULL}, {{4, 5}, NULL} ] -auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;}); +auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); lists_column_wrapper l{ {{{0, 1}, {2, 3}}, validity}, {{{4, 5}, {6, 7}}, validity} }; ``` -#### `structs_column_wrapper` +#### structs_column_wrapper The `structs_column_wrapper` class should be used to create columns of structs. It provides constructors that accept a vector or initializer list of pre-constructed columns or column wrappers @@ -363,6 +367,7 @@ for child columns. For nullable columns, an additional iterator can be provided validity of each struct. Examples: + ```c++ // The following constructs a column for struct< int, string >. 
auto child_int_col = fixed_width_column_wrapper{ 1, 2, 3, 4, 5 }.release(); @@ -378,9 +383,7 @@ struct_column_wrapper struct_column_wrapper{ }; auto struct_col {struct_column_wrapper.release()}; -``` -```c++ // The following constructs a column for struct< int, string >. fixed_width_column_wrapper child_int_col_wrapper{ 1, 2, 3, 4, 5 }; string_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; @@ -391,16 +394,14 @@ struct_column_wrapper struct_column_wrapper{ }; auto struct_col {struct_column_wrapper.release()}; -``` -```c++ // The following constructs a column for struct< int, string >. fixed_width_column_wrapper child_int_col_wrapper{ 1, 2, 3, 4, 5 }; string_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; struct_column_wrapper struct_column_wrapper{ {child_int_col_wrapper, child_string_col_wrapper} - cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i%2; }) // Validity + cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i % 2; }) // Validity }; auto struct_col {struct_column_wrapper.release()}; @@ -411,12 +412,12 @@ auto struct_col {struct_column_wrapper.release()}; A common operation in testing is verifying that two columns are equal, or equivalent, or that they have the same metadata. -#### `CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL` +#### CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL Verifies that two columns have the same type, size, and nullability. For nested types, recursively verifies the equality of type, size and nullability of all nested children. -#### `CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUIVALENT` +#### CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUIVALENT Verifies that two columns have equivalent type and equal size, ignoring nullability. For nested types, recursively verifies the equivalence of type, and equality of size of all nested children, @@ -428,17 +429,17 @@ different scales. 
Nested type columns can be equivalent in the case where they b but one has children (also empty) and the other does not. For columns with nonzero size, both equals and equivalent expect equal number of children. -#### `CUDF_TEST_EXPECT_COLUMNS_EQUAL` +#### CUDF_TEST_EXPECT_COLUMNS_EQUAL Verifies that two columns have equal properties and verifies elementwise equality of the column data. Null elements are treated as equal. -#### `CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT` +#### CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT Verifies that two columns have equivalent properties and verifies elementwise equivalence of the column data. Null elements are treated as equivalent. -#### `CUDF_TEST_EXPECT_EQUAL_BUFFERS` +#### CUDF_TEST_EXPECT_EQUAL_BUFFERS Verifies the bitwise equality of two device memory buffers. @@ -446,4 +447,4 @@ Verifies the bitwise equality of two device memory buffers. `include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing columns (`print`), converting column data to string (`to_string`, `to_strings`), and copying data to -the host (`to_host). +the host (`to_host`). diff --git a/cpp/docs/strings.png b/cpp/doxygen/developer_guide/strings.png similarity index 100% rename from cpp/docs/strings.png rename to cpp/doxygen/developer_guide/strings.png diff --git a/cpp/doxygen/modify_fences.sh b/cpp/doxygen/modify_fences.sh new file mode 100755 index 00000000000..195f60d8abc --- /dev/null +++ b/cpp/doxygen/modify_fences.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Copyright (c) 2022, NVIDIA CORPORATION. + +# This script modifies the GitHub Markdown style code fences in our MD files +# into the PHP style that Doxygen supports, allowing us to display code +# properly both on the GitHub GUI and in published Doxygen documentation. 
+ +sed 's/```c++/```{.cpp}/g' "$@" From 4681bdc9bdbb5e811a500336e810b2fae7481fc2 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 12 Sep 2022 18:17:18 -0400 Subject: [PATCH 14/25] Ensure that all cudf tests and benchmarks are conda env aware (#11666) This is needed so that `rapids-cmake` population of `rpath-link` link options is provided to all executables. Fixes #11628 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Keith Kraus (https://github.com/kkraus14) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11666 --- cpp/CMakeLists.txt | 6 +++++- cpp/benchmarks/CMakeLists.txt | 10 +++++++--- cpp/tests/CMakeLists.txt | 5 ++++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 69394a34624..ae33ad260d2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -710,7 +710,11 @@ target_compile_options( "$:${CUDF_CUDA_FLAGS}>>" ) -target_link_libraries(cudftestutil PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf) +target_link_libraries( + cudftestutil + PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf + PRIVATE $ +) target_include_directories( cudftestutil PUBLIC "$" diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index fb46d1b583e..bbd51546668 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -23,8 +23,10 @@ target_compile_options( ) target_link_libraries( - cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main - benchmark::benchmark nvbench::nvbench Threads::Threads cudf cudftestutil + cudf_datagen + PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main benchmark::benchmark + nvbench::nvbench Threads::Threads cudf cudftestutil + PRIVATE $ ) target_include_directories( @@ -41,7 +43,7 @@ add_library( cudf_benchmark_common OBJECT 
"${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" synchronization/synchronization.cpp io/cuio_common.cpp ) -target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) +target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( OUTPUT CUDF_BENCHMARKS COMMAND echo Running benchmarks @@ -68,6 +70,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main + $ ) add_custom_command( OUTPUT CUDF_BENCHMARKS @@ -96,6 +99,7 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::main + $ ) install( TARGETS ${CMAKE_BENCH_NAME} diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 99267d37318..6a36e616268 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -31,7 +31,10 @@ function(ConfigureTest CMAKE_TEST_NAME) CUDA_STANDARD_REQUIRED ON ) - target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main) + target_link_libraries( + ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main + $ + ) add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) install( TARGETS ${CMAKE_TEST_NAME} From e99e06905eb5f0e0ba7a4f548e796cdbffebd8f8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 12 Sep 2022 18:21:36 -0500 Subject: [PATCH 15/25] Fix an issue related to `Multindex` when `group_keys=True` (#11689) When there are multiple groups in a groupby, passing `group_keys=True` raises an error: ```python In [1]: import cudf In [2]: gdf = cudf.DataFrame( ...: {"A": "a a b".split(), "B": [1, 1, 3], "C": [4, 6, 5]} ...: ) In [3]: g_group = gdf.groupby(["A", "B"], group_keys=True) In [4]: g_group[["B", "C"]].apply(lambda x: x / x.sum()) --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In 
[4], line 1 ----> 1 g_group[["B", "C"]].apply(lambda x: x / x.sum()) File /nvme/0/pgali/envs/cudfdev/lib/python3.9/site-packages/cudf/core/groupby/groupby.py:782, in GroupBy.apply(self, function, *args) 778 result = cudf.concat(chunk_results) 779 if self._group_keys: 780 result.index = cudf.MultiIndex._from_data( 781 { --> 782 group_keys.name: group_keys._column, 783 None: grouped_values.index._column, 784 } 785 ) 787 if self._sort: 788 result = result.sort_index() AttributeError: 'MultiIndex' object has no attribute '_column' ``` This PR fixes the issue: ```python In [1]: import cudf In [2]: gdf = cudf.DataFrame( ...: {"A": "a a b".split(), "B": [1, 1, 3], "C": [4, 6, 5]} ...: ) In [3]: g_group = gdf.groupby(["A", "B"], group_keys=True) In [4]: g_group[["B", "C"]].apply(lambda x: x / x.sum()) Out[4]: B C A B a 1 0 0.5 0.4 1 0.5 0.6 b 3 2 1.0 1.0 ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11689 --- python/cudf/cudf/core/groupby/groupby.py | 17 ++++++++--------- python/cudf/cudf/tests/test_groupby.py | 13 +++++++++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index dad0684b111..c96407a7ff9 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -584,9 +584,11 @@ def _grouped(self): grouped_key_cols, grouped_value_cols, offsets = self._groupby.groups( [*self.obj._index._columns, *self.obj._columns] ) - grouped_keys = cudf.core.index._index_from_columns( - grouped_key_cols, name=self.grouping.keys.name - ) + grouped_keys = cudf.core.index._index_from_columns(grouped_key_cols) + if isinstance(self.grouping.keys, cudf.MultiIndex): + grouped_keys.names = self.grouping.keys.names + else: + grouped_keys.name = self.grouping.keys.name grouped_values = 
self.obj._from_columns_like_self( grouped_value_cols, column_names=self.obj._column_names, @@ -777,12 +779,9 @@ def mult(df): else: result = cudf.concat(chunk_results) if self._group_keys: - result.index = cudf.MultiIndex._from_data( - { - group_keys.name: group_keys._column, - None: grouped_values.index._column, - } - ) + index_data = group_keys._data.copy(deep=True) + index_data[None] = grouped_values.index._column + result.index = cudf.MultiIndex._from_data(index_data) if self._sort: result = result.sort_index() diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index c1be9cdb290..911b1d5443e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2698,14 +2698,19 @@ def test_groupby_pct_change_empty_columns(): False, ], ) -def test_groupby_group_keys(group_keys): +@pytest.mark.parametrize("by", ["A", ["A", "B"]]) +def test_groupby_group_keys(group_keys, by): gdf = cudf.DataFrame( - {"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]} + { + "A": "a a a a b b".split(), + "B": [1, 1, 2, 2, 3, 3], + "C": [4, 6, 5, 9, 8, 7], + } ) pdf = gdf.to_pandas() - g_group = gdf.groupby("A", group_keys=group_keys) - p_group = pdf.groupby("A", group_keys=group_keys) + g_group = gdf.groupby(by, group_keys=group_keys) + p_group = pdf.groupby(by, group_keys=group_keys) actual = g_group[["B", "C"]].apply(lambda x: x / x.sum()) expected = p_group[["B", "C"]].apply(lambda x: x / x.sum()) From 7b0d597678028b035abe7e966f6ba44eb73dc087 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 12 Sep 2022 17:09:54 -0700 Subject: [PATCH 16/25] Fix encode/decode of negative timestamps in ORC reader/writer (#11586) Fixes #11525 Contains a chain of fixes: 1. Allow negative nanoseconds in negative timestamps - aligns writer with pyorc; 2. Limit seconds adjustment to positive nanoseconds - fixes the off-by-one issue reported in #11525; 3. 
Fix the decode of large uint64_t values (>max `int64_t`) - fixes reading of cuDF encoded negative nanoseconds; 4. Avoid mode 2 encode when the base value is larger than max `int64_t` - follows the specs and fixes reading of negative nanoseconds using non-cuDF readers. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11586 --- cpp/src/io/orc/stripe_data.cu | 27 +++++++++++++++--------- cpp/src/io/orc/stripe_enc.cu | 19 +++++++---------- python/cudf/cudf/tests/test_orc.py | 33 ++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index a4cd5de8ec8..4fa407f4e88 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -745,17 +745,21 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits if constexpr (sizeof(T) <= 4) { - uint32_t baseval, mask; + uint32_t baseval; bytestream_readbe(bs, pos * 8, bw * 8, baseval); - mask = (1 << (bw * 8 - 1)) - 1; - rle->baseval.u32[r] = (baseval > mask) ? (-(int32_t)(baseval & mask)) : baseval; + uint32_t const mask = (1u << (bw * 8 - 1)) - 1; + // Negative values are represented with the highest bit set to 1 + rle->baseval.u32[r] = (std::is_signed_v and baseval > mask) + ? -static_cast(baseval & mask) + : baseval; } else { - uint64_t baseval, mask; + uint64_t baseval; bytestream_readbe(bs, pos * 8, bw * 8, baseval); - mask = 1; - mask <<= (bw * 8) - 1; - mask -= 1; - rle->baseval.u64[r] = (baseval > mask) ? 
(-(int64_t)(baseval & mask)) : baseval; + uint64_t const mask = (1ul << (bw * 8 - 1)) - 1; + // Negative values are represented with the highest bit set to 1 + rle->baseval.u64[r] = (std::is_signed_v and baseval > mask) + ? -static_cast(baseval & mask) + : baseval; } rle->m2_pw_byte3[r] = (pw << 8) | byte3; pos += bw; @@ -1758,12 +1762,15 @@ __global__ void __launch_bounds__(block_size) } case TIMESTAMP: { int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - uint64_t nanos = secondary_val; + int64_t nanos = secondary_val; nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; if (!tz_table.ttimes.empty()) { seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); } - if (seconds < 0 && nanos != 0) { seconds -= 1; } + // Adjust seconds only for negative timestamps with positive nanoseconds. + // Alternative way to represent negative timestamps is with negative nanoseconds + // in which case the adjustment in not needed. + if (seconds < 0 && nanos > 0) { seconds -= 1; } duration_ns d_ns{nanos}; duration_s d_s{seconds}; diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b1c04099e64..ef4bdd421fb 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -444,7 +444,9 @@ static __device__ uint32_t IntegerRLE( s->u.intrle.literal_w = bytecnt; } else { uint32_t range, w; - if (mode1_w > mode2_w && (literal_run - 1) * (mode1_w - mode2_w) > 4) { + // Mode 2 base value cannot be bigger than max int64_t, i.e. 
the first bit has to be 0 + if (vmin <= std::numeric_limits::max() and mode1_w > mode2_w and + (literal_run - 1) * (mode1_w - mode2_w) > 4) { s->u.intrle.literal_mode = 2; w = mode2_w; range = (uint32_t)vrange_mode2; @@ -808,17 +810,10 @@ __global__ void __launch_bounds__(block_size) case BOOLEAN: case BYTE: s->vals.u8[nz_idx] = column.element(row); break; case TIMESTAMP: { - int64_t ts = column.element(row); - int32_t ts_scale = powers_of_ten[9 - min(s->chunk.scale, 9)]; - int64_t seconds = ts / ts_scale; - int64_t nanos = (ts - seconds * ts_scale); - // There is a bug in the ORC spec such that for negative timestamps, it is understood - // between the writer and reader that nanos will be adjusted to their positive component - // but the negative seconds will be left alone. This means that -2.6 is encoded as - // seconds = -2 and nanos = 1+(-0.6) = 0.4 - // This leads to an error in decoding time where -1 < time (s) < 0 - // Details: https://github.com/rapidsai/cudf/pull/5529#issuecomment-648768925 - if (nanos < 0) { nanos += ts_scale; } + int64_t ts = column.element(row); + int32_t ts_scale = powers_of_ten[9 - min(s->chunk.scale, 9)]; + int64_t seconds = ts / ts_scale; + int64_t nanos = (ts - seconds * ts_scale); s->vals.i64[nz_idx] = seconds - kORCTimeToUTC; if (nanos != 0) { // Trailing zeroes are encoded in the lower 3-bits diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c2188003531..18d159bc423 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1838,3 +1838,36 @@ def test_orc_writer_cols_as_map_type_error(): TypeError, match="cols_as_map_type must be a list of column names." 
): df.to_orc(buffer, cols_as_map_type=1) + + +@pytest.fixture +def negative_timestamp_df(): + return cudf.DataFrame( + { + "a": [ + pd.Timestamp("1969-12-31 23:59:59.000123"), + pd.Timestamp("1969-12-31 23:59:58.000999"), + pd.Timestamp("1969-12-31 23:59:58.001001"), + pd.Timestamp("1839-12-24 03:58:56.000826"), + ] + } + ) + + +@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) +def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): + buffer = BytesIO() + pyorc_table = pa.Table.from_pandas( + negative_timestamp_df.to_pandas(), preserve_index=False + ) + pyarrow.orc.write_table(pyorc_table, buffer) + + assert_eq(negative_timestamp_df, cudf.read_orc(buffer, engine=engine)) + + +def test_orc_writer_negative_timestamp(negative_timestamp_df): + buffer = BytesIO() + negative_timestamp_df.to_orc(buffer) + + assert_eq(negative_timestamp_df, pd.read_orc(buffer)) + assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) From 90cd2dc6ad1a7d621b80635ce4a3e99edffac823 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 13 Sep 2022 07:57:39 +0530 Subject: [PATCH 17/25] review comments (davidwendt) --- cpp/src/io/json/nested_json.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 94c51679c76..fca9a3ecc42 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -24,7 +24,9 @@ #include #include +#include +#include #include namespace cudf::io::json { @@ -308,6 +310,7 @@ std::pair, rmm::device_uvector> ge * @param tokens Vector of token types in the json string * @param token_indices The indices within the input string corresponding to each token * @param stream The CUDA stream to which kernels are dispatched + * @param mr Optional, resource with which to allocate * @return A tree representation of the input JSON string as vectors of node type, parent index, * level, begin index, and end index in the input JSON string */ From 
7e86a1b73d357e83b2d0bb166ac3a0fdbf77fa99 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 13 Sep 2022 12:25:35 -0700 Subject: [PATCH 18/25] Default to Snappy compression in `to_orc` when using cuDF or Dask (#11690) Fix `to_orc` defaults for the compression type in cuDF and Dask. Aligns the default to the libcudf default (and to the Parquet default). Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/11690 --- python/cudf/cudf/_lib/orc.pyx | 4 ++-- python/cudf/cudf/_lib/parquet.pyx | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/io/orc.py | 2 +- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/utils/ioutils.py | 6 +++--- python/dask_cudf/dask_cudf/io/orc.py | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 1c9f388873c..be7b29da515 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -242,7 +242,7 @@ cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): cpdef write_orc(table, object path_or_buf, - object compression=None, + object compression="snappy", object statistics="ROWGROUP", object stripe_size_bytes=None, object stripe_size_rows=None, @@ -381,7 +381,7 @@ cdef class ORCWriter: def __cinit__(self, object path, object index=None, - object compression=None, + object compression="snappy", object statistics="ROWGROUP", object cols_as_map_type=None): diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 3c8e78bd87a..891f259a828 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -487,7 +487,7 @@ cdef class ParquetWriter: cdef size_type max_page_size_rows def __cinit__(self, object filepath_or_buffer, object index=None, - object compression=None, str statistics="ROWGROUP", + object 
compression="snappy", str statistics="ROWGROUP", int row_group_size_bytes=134217728, int row_group_size_rows=1000000, int max_page_size_bytes=524288, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c07a88e9396..2f1695e4445 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6012,7 +6012,7 @@ def to_csv( ) @ioutils.doc_to_orc() - def to_orc(self, fname, compression=None, *args, **kwargs): + def to_orc(self, fname, compression="snappy", *args, **kwargs): """{docstring}""" from cudf.io import orc diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 378cb25fafb..718b9c4144f 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -407,7 +407,7 @@ def read_orc_stripe(orc_file, stripe, columns): def to_orc( df, fname, - compression=None, + compression="snappy", statistics="ROWGROUP", stripe_size_bytes=None, stripe_size_rows=None, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 4fab657d9a0..7ac391c5f3d 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -911,7 +911,7 @@ def __init__( path, partition_cols, index=None, - compression=None, + compression="snappy", statistics="ROWGROUP", max_file_size=None, file_name_prefix=None, diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 19815c7c506..fb1b0235822 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -429,7 +429,7 @@ ---------- fname : str File path or object where the ORC dataset will be stored. -compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default None +compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default 'snappy' Name of the compression to use. Use None for no compression. enable_statistics: boolean, default True Enable writing column statistics. 
@@ -1013,10 +1013,10 @@ line_terminator : char, default '\\n' chunksize : int or None, default None Rows to write at a time -encoding: str, default 'utf-8' +encoding : str, default 'utf-8' A string representing the encoding to use in the output file Only ‘utf-8’ is currently supported -compression: str, None +compression : str, None A string representing the compression scheme to use in the the output file Compression while writing csv is not supported currently Returns diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index f5df0e261c9..e731057ed90 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -115,7 +115,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): return dd.core.new_dd_object(dsk, name, meta, divisions) -def write_orc_partition(df, path, fs, filename, compression=None): +def write_orc_partition(df, path, fs, filename, compression="snappy"): full_path = fs.sep.join([path, filename]) with fs.open(full_path, mode="wb") as out_file: if not isinstance(out_file, IOBase): @@ -129,7 +129,7 @@ def to_orc( path, write_index=True, storage_options=None, - compression=None, + compression="snappy", compute=True, **kwargs, ): From 69cb31d3aeabc7b51c8ea86fe94a086c616175cf Mon Sep 17 00:00:00 2001 From: MithunR Date: Tue, 13 Sep 2022 14:27:45 -0700 Subject: [PATCH 19/25] Support DECIMAL order-by for RANGE window functions (#11645) CUDF grouped RANGE window functions currently support only integral types and timestamps as the ORDER BY (OBY) column. This commit adds support for DECIMAL types (i.e. decimal32, decimal64, and decimal128) to be used as the ORDER BY column in RANGE window functions. This feature allows `spark-rapids` to address https://github.com/NVIDIA/spark-rapids/issues/6400. 
Authors: - MithunR (https://github.com/mythrocks) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - David Wendt (https://github.com/davidwendt) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/11645 --- .../rolling/detail/range_window_bounds.hpp | 67 ++-- cpp/src/rolling/grouped_rolling.cu | 40 +-- cpp/src/rolling/range_window_bounds.cpp | 16 +- cpp/tests/CMakeLists.txt | 1 + .../rolling/grouped_rolling_range_test.cpp | 285 ++++++++++++++++++ .../rolling/range_window_bounds_test.cpp | 57 ++++ java/src/main/java/ai/rapids/cudf/Scalar.java | 7 +- java/src/main/java/ai/rapids/cudf/Table.java | 3 + .../test/java/ai/rapids/cudf/TableTest.java | 119 ++++++++ 9 files changed, 552 insertions(+), 43 deletions(-) create mode 100644 cpp/tests/rolling/grouped_rolling_range_test.cpp diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp index 252e7fd7b67..266f397b1e3 100644 --- a/cpp/src/rolling/detail/range_window_bounds.hpp +++ b/cpp/src/rolling/detail/range_window_bounds.hpp @@ -28,7 +28,7 @@ namespace detail { template constexpr bool is_supported_range_type() { - return cudf::is_duration() || + return cudf::is_duration() || cudf::is_fixed_point() || (std::is_integral_v && !cudf::is_boolean()); } @@ -37,7 +37,7 @@ constexpr bool is_supported_range_type() template constexpr bool is_supported_order_by_column_type() { - return cudf::is_timestamp() || + return cudf::is_timestamp() || cudf::is_fixed_point() || (std::is_integral_v && !cudf::is_boolean()); } @@ -49,6 +49,11 @@ constexpr bool is_supported_order_by_column_type() /// a. For `TIMESTAMP_DAYS`, the range-type is `DURATION_DAYS`. /// Comparisons are done in `int32_t`. /// b. For all other timestamp types, comparisons are done in `int64_t`. +/// 3. For decimal types, all comparisons are done with the rep type, +/// after scaling the rep value to the same scale as the order by column: +/// a. 
For decimal32, the range-type is `int32_t`. +/// b. For decimal64, the range-type is `int64_t`. +/// c. For decimal128, the range-type is `__int128_t`. template struct range_type_impl { using type = void; @@ -69,45 +74,66 @@ struct range_type_impl +struct range_type_impl(), void>> { + using type = FixedPointType; + using rep_type = typename type::rep; +}; + template using range_type = typename range_type_impl::type; template using range_rep_type = typename range_type_impl::rep_type; -namespace { - template -void assert_non_negative(T const& value) +void assert_non_negative([[maybe_unused]] T const& value) { - (void)value; if constexpr (std::numeric_limits::is_signed) { CUDF_EXPECTS(value >= T{0}, "Range scalar must be >= 0."); } } -template < - typename RangeT, - typename RepT, - std::enable_if_t && !cudf::is_boolean(), void>* = nullptr> -RepT range_comparable_value_impl(scalar const& range_scalar, rmm::cuda_stream_view stream) +template && !cudf::is_boolean())> +RepT range_comparable_value_impl(scalar const& range_scalar, + bool, + data_type const&, + rmm::cuda_stream_view stream) { auto val = static_cast const&>(range_scalar).value(stream); assert_non_negative(val); return val; } -template (), void>* = nullptr> -RepT range_comparable_value_impl(scalar const& range_scalar, rmm::cuda_stream_view stream) +template ())> +RepT range_comparable_value_impl(scalar const& range_scalar, + bool, + data_type const&, + rmm::cuda_stream_view stream) { auto val = static_cast const&>(range_scalar).value(stream).count(); assert_non_negative(val); return val; } -} // namespace +template ())> +RepT range_comparable_value_impl(scalar const& range_scalar, + bool is_unbounded, + data_type const& order_by_data_type, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(is_unbounded || range_scalar.type().scale() >= order_by_data_type.scale(), + "Range bounds scalar must match/exceed the scale of the orderby column."); + auto const fixed_point_value = + static_cast 
const&>(range_scalar).fixed_point_value(stream); + auto const value = + fixed_point_value.rescaled(numeric::scale_type{order_by_data_type.scale()}).value(); + assert_non_negative(value); + return value; +} /** * @brief Fetch the value of the range_window_bounds scalar, for comparisons @@ -115,22 +141,25 @@ RepT range_comparable_value_impl(scalar const& range_scalar, rmm::cuda_stream_vi * * @tparam OrderByType The type of the orderby column with which the range value will be compared * @param range_bounds The range_window_bounds whose value is to be read + * @param order_by_data_type The data type for the order-by column * @param stream The CUDA stream for device memory operations * @return RepType Value of the range scalar */ template range_rep_type range_comparable_value( range_window_bounds const& range_bounds, - rmm::cuda_stream_view stream = cudf::default_stream_value) + data_type const& order_by_data_type = data_type{type_to_id()}, + rmm::cuda_stream_view stream = cudf::default_stream_value) { auto const& range_scalar = range_bounds.range_scalar(); using range_type = cudf::detail::range_type; CUDF_EXPECTS(range_scalar.type().id() == cudf::type_to_id(), - "Unexpected range type for specified orderby column."); + "Range bounds scalar must match the type of the orderby column."); using rep_type = cudf::detail::range_rep_type; - return range_comparable_value_impl(range_scalar, stream); + return range_comparable_value_impl( + range_scalar, range_bounds.is_unbounded(), order_by_data_type, stream); } } // namespace detail diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 52587a20fc7..c1be33a9cd5 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -224,47 +224,49 @@ namespace { /** * @brief Add `delta` to value, and cap at numeric_limits::max(), for signed types. 
*/ -template ::is_signed>* = nullptr> +template ::is_signed)> __device__ T add_safe(T const& value, T const& delta) { // delta >= 0. - return (value < 0 || (std::numeric_limits::max() - value) >= delta) + return (value < 0 || (cuda::std::numeric_limits::max() - value) >= delta) ? (value + delta) - : std::numeric_limits::max(); + : cuda::std::numeric_limits::max(); } /** * @brief Add `delta` to value, and cap at numeric_limits::max(), for unsigned types. */ -template ::is_signed>* = nullptr> +template ::is_signed)> __device__ T add_safe(T const& value, T const& delta) { // delta >= 0. - return ((std::numeric_limits::max() - value) >= delta) ? (value + delta) - : std::numeric_limits::max(); + return ((cuda::std::numeric_limits::max() - value) >= delta) + ? (value + delta) + : cuda::std::numeric_limits::max(); } /** * @brief Subtract `delta` from value, and cap at numeric_limits::min(), for signed types. */ -template ::is_signed>* = nullptr> +template ::is_signed)> __device__ T subtract_safe(T const& value, T const& delta) { // delta >= 0; - return (value >= 0 || (value - std::numeric_limits::min()) >= delta) + return (value >= 0 || (value - cuda::std::numeric_limits::min()) >= delta) ? (value - delta) - : std::numeric_limits::min(); + : cuda::std::numeric_limits::min(); } /** * @brief Subtract `delta` from value, and cap at numeric_limits::min(), for unsigned types. */ -template ::is_signed>* = nullptr> +template ::is_signed)> __device__ T subtract_safe(T const& value, T const& delta) { // delta >= 0; - return ((value - std::numeric_limits::min()) >= delta) ? (value - delta) - : std::numeric_limits::min(); + return ((value - cuda::std::numeric_limits::min()) >= delta) + ? 
(value - delta) + : cuda::std::numeric_limits::min(); } /// Given a single, ungrouped order-by column, return the indices corresponding @@ -780,7 +782,7 @@ template std::unique_ptr grouped_range_rolling_window_impl( column_view const& input, column_view const& orderby_column, - cudf::order const& timestamp_ordering, + cudf::order const& order_of_orderby_column, rmm::device_uvector const& group_offsets, rmm::device_uvector const& group_labels, range_window_bounds const& preceding_window, @@ -790,10 +792,12 @@ std::unique_ptr grouped_range_rolling_window_impl( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto preceding_value = detail::range_comparable_value(preceding_window); - auto following_value = detail::range_comparable_value(following_window); + auto preceding_value = + detail::range_comparable_value(preceding_window, orderby_column.type(), stream); + auto following_value = + detail::range_comparable_value(following_window, orderby_column.type(), stream); - if (timestamp_ordering == cudf::order::ASCENDING) { + if (order_of_orderby_column == cudf::order::ASCENDING) { return group_offsets.is_empty() ? 
range_window_ASC(input, orderby_column, preceding_value, @@ -856,7 +860,7 @@ struct dispatch_grouped_range_rolling_window { std::unique_ptr> operator()(column_view const& input, column_view const& orderby_column, - cudf::order const& timestamp_ordering, + cudf::order const& order_of_orderby_column, rmm::device_uvector const& group_offsets, rmm::device_uvector const& group_labels, range_window_bounds const& preceding_window, @@ -868,7 +872,7 @@ struct dispatch_grouped_range_rolling_window { { return grouped_range_rolling_window_impl(input, orderby_column, - timestamp_ordering, + order_of_orderby_column, group_offsets, group_labels, preceding_window, diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp index 831e901f652..77520ccff63 100644 --- a/cpp/src/rolling/range_window_bounds.cpp +++ b/cpp/src/rolling/range_window_bounds.cpp @@ -30,28 +30,34 @@ namespace { * This makes it possible to copy construct and copy assign `range_window_bounds` objects. */ struct range_scalar_constructor { - template (), void>* = nullptr> + template ())> std::unique_ptr operator()(scalar const& range_scalar_) const { CUDF_FAIL( "Unsupported range type. 
" - "Only Durations and non-boolean integral range types are allowed."); + "Only Durations, fixed-point, and non-boolean integral range types are allowed."); } - template (), void>* = nullptr> + template ())> std::unique_ptr operator()(scalar const& range_scalar_) const { return std::make_unique>( static_cast const&>(range_scalar_)); } - template && !cudf::is_boolean(), void>* = nullptr> + template && not cudf::is_boolean())> std::unique_ptr operator()(scalar const& range_scalar_) const { return std::make_unique>( static_cast const&>(range_scalar_)); } + + template ())> + std::unique_ptr operator()(scalar const& range_scalar_) const + { + return std::make_unique>( + static_cast const&>(range_scalar_)); + } }; } // namespace diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6a36e616268..3710bc6cdfa 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -346,6 +346,7 @@ ConfigureTest( ROLLING_TEST rolling/collect_ops_test.cpp rolling/empty_input_test.cpp + rolling/grouped_rolling_range_test.cpp rolling/grouped_rolling_test.cpp rolling/lead_lag_test.cpp rolling/nth_element_test.cpp diff --git a/cpp/tests/rolling/grouped_rolling_range_test.cpp b/cpp/tests/rolling/grouped_rolling_range_test.cpp new file mode 100644 index 00000000000..c44d804dcf1 --- /dev/null +++ b/cpp/tests/rolling/grouped_rolling_range_test.cpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf::test::rolling { + +template +using fwcw = cudf::test::fixed_width_column_wrapper; +template +using decimals_column = cudf::test::fixed_point_column_wrapper; +using ints_column = fwcw; +using bigints_column = fwcw; +using column_ptr = std::unique_ptr; +using namespace numeric; +using namespace cudf::test::iterators; + +struct BaseGroupedRollingRangeOrderByDecimalTest : public BaseFixture { + // Stand-in for std::pow(10, n), but for integral return. + static constexpr std::array pow10{1, 10, 100, 1000, 10000, 100000}; + // Test data. + column_ptr const grouping_keys = ints_column{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}.release(); + column_ptr const agg_values = ints_column{1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}.release(); + cudf::size_type const num_rows = grouping_keys->size(); +}; + +using base = BaseGroupedRollingRangeOrderByDecimalTest; // Shortcut to base test class. + +template +struct GroupedRollingRangeOrderByDecimalTypedTest : BaseGroupedRollingRangeOrderByDecimalTest { + using Rep = typename DecimalT::rep; + + auto make_fixed_point_range_bounds(typename DecimalT::rep value, scale_type scale) const + { + return cudf::range_window_bounds::get(*cudf::make_fixed_point_scalar(value, scale)); + } + + auto make_unbounded_fixed_point_range_bounds() const + { + return cudf::range_window_bounds::unbounded(data_type{type_to_id()}); + } + + /// For different scales, generate order_by column with + /// the same effective values: [0, 100, 200, 300, ... 1100, 1200, 1300] + /// For scale == -2, the rep values are: [0, 10000, 20000, 30000, ... 110000, 120000, 130000] + /// For scale == 2, the rep values are: [0, 1, 2, 3, ... 
11, 12, 13] + column_ptr generate_order_by_column(scale_type scale) const + { + auto const begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [&](auto i) -> Rep { return (i * 10000) / base::pow10[scale + 2]; }); + + return decimals_column{begin, begin + num_rows, scale_type{scale}}.release(); + } + + /** + * @brief Scale the range bounds value to new scale, so that effective + * value remains identical. + * + * Keeping the effective range bounds value identical ensures that + * the expected result from grouped_rolling remains the same. + */ + Rep rescale_range_value(Rep const& value_at_scale_0, scale_type new_scale) const + { + // Scale -> Rep (for value == 200) + // -2 -> 20000 + // -1 -> 2000 + // 0 -> 200 + // 1 -> 20 + // 2 -> 2 + return (value_at_scale_0 * 100) / base::pow10[new_scale + 2]; + } + + /** + * @brief Get grouped rolling results for specified order-by column and range scale + * + */ + column_ptr get_grouped_range_rolling_result(column_view const& order_by_column, + scale_type const& range_scale) const + { + auto const preceding = + this->make_fixed_point_range_bounds(rescale_range_value(Rep{200}, range_scale), range_scale); + auto const following = + this->make_fixed_point_range_bounds(rescale_range_value(Rep{100}, range_scale), range_scale); + + return cudf::grouped_range_rolling_window(cudf::table_view{{grouping_keys->view()}}, + order_by_column, + cudf::order::ASCENDING, + agg_values->view(), + preceding, + following, + 1, // min_periods + *cudf::make_sum_aggregation()); + } + + /** + * @brief Run grouped_rolling test for specified order-by column scale with + * no nulls in the order-by column + * + */ + void run_test_no_null_oby(scale_type const& order_by_column_scale) const + { + auto const order_by = generate_order_by_column(order_by_column_scale); + // Run tests for range bounds generated for all scales >= oby_column_scale. 
+ for (int32_t range_scale = order_by_column_scale; range_scale <= 2; ++range_scale) { + auto const results = get_grouped_range_rolling_result(*order_by, scale_type{range_scale}); + auto const expected_results = + bigints_column{{2, 3, 4, 4, 4, 3, 4, 6, 8, 6, 6, 9, 12, 9}, no_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_results); + } + } + + /** + * @brief Run grouped_rolling test for specified order-by column scale with + * nulls in the order-by column (i.e. 2 nulls at the beginning of each group) + * + */ + void run_test_nulls_in_oby(scale_type const& order_by_column_scale) const + { + // Nullify the first two rows of each group in the order_by column. + auto const nulled_order_by = [&] { + auto col = generate_order_by_column(order_by_column_scale); + auto new_null_mask = create_null_mask(col->size(), mask_state::ALL_VALID); + set_null_mask( + static_cast(new_null_mask.data()), 0, 2, false); // Nulls in first group. + set_null_mask( + static_cast(new_null_mask.data()), 6, 8, false); // Nulls in second group. + set_null_mask( + static_cast(new_null_mask.data()), 10, 12, false); // Nulls in third group. + col->set_null_mask(std::move(new_null_mask)); + return col; + }(); + + // Run tests for range bounds generated for all scales >= oby_column_scale. + for (auto range_scale = int32_t{order_by_column_scale}; range_scale <= 2; ++range_scale) { + auto const results = + get_grouped_range_rolling_result(*nulled_order_by, scale_type{range_scale}); + auto const expected_results = + bigints_column{{2, 2, 2, 3, 4, 3, 4, 4, 4, 4, 6, 6, 6, 6}, no_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_results); + } + } + + /** + * @brief Run grouped_rolling test for specified order-by column scale with + * unbounded preceding and unbounded following. 
+ * + */ + void run_test_unbounded_preceding_to_unbounded_following(scale_type oby_column_scale) + { + auto const order_by = generate_order_by_column(oby_column_scale); + auto const preceding = make_unbounded_fixed_point_range_bounds(); + auto const following = make_unbounded_fixed_point_range_bounds(); + auto results = + cudf::grouped_range_rolling_window(cudf::table_view{{grouping_keys->view()}}, + order_by->view(), + cudf::order::ASCENDING, + agg_values->view(), + preceding, + following, + 1, // min_periods + *cudf::make_sum_aggregation()); + + auto expected_results = + bigints_column{{6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 12, 12, 12, 12}, no_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_results); + } + + /** + * @brief Run grouped_rolling test for specified order-by column scale with + * unbounded preceding and unbounded following. + * + */ + void run_test_unbounded_preceding_to_current_row(scale_type oby_column_scale) + { + auto const order_by = generate_order_by_column(oby_column_scale); + auto const unbounded_preceding = make_unbounded_fixed_point_range_bounds(); + + for (int32_t range_scale = oby_column_scale; range_scale <= 2; ++range_scale) { + auto const current_row = make_fixed_point_range_bounds( + rescale_range_value(Rep{0}, scale_type{range_scale}), scale_type{range_scale}); + auto const results = + cudf::grouped_range_rolling_window(cudf::table_view{{grouping_keys->view()}}, + order_by->view(), + cudf::order::ASCENDING, + agg_values->view(), + unbounded_preceding, + current_row, + 1, // min_periods + *cudf::make_sum_aggregation()); + + auto expected_results = + bigints_column{{1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 3, 6, 9, 12}, no_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_results); + } + } + + /** + * @brief Run grouped_rolling test for specified order-by column scale with + * unbounded preceding and unbounded following. 
+ * + */ + void run_test_current_row_to_unbounded_following(scale_type oby_column_scale) + { + auto const order_by = generate_order_by_column(oby_column_scale); + auto const unbounded_following = make_unbounded_fixed_point_range_bounds(); + + for (int32_t range_scale = oby_column_scale; range_scale <= 2; ++range_scale) { + auto const current_row = make_fixed_point_range_bounds( + rescale_range_value(Rep{0}, scale_type{range_scale}), scale_type{range_scale}); + auto const results = + cudf::grouped_range_rolling_window(cudf::table_view{{grouping_keys->view()}}, + order_by->view(), + cudf::order::ASCENDING, + agg_values->view(), + current_row, + unbounded_following, + 1, // min_periods + *cudf::make_sum_aggregation()); + + auto expected_results = + bigints_column{{6, 5, 4, 3, 2, 1, 8, 6, 4, 2, 12, 9, 6, 3}, no_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_results); + } + } +}; + +TYPED_TEST_SUITE(GroupedRollingRangeOrderByDecimalTypedTest, FixedPointTypes); + +TYPED_TEST(GroupedRollingRangeOrderByDecimalTypedTest, BoundedRanges) +{ + for (auto const order_by_column_scale : {-2, -1, 0, 1, 2}) { + auto const oby_scale = scale_type{order_by_column_scale}; + this->run_test_no_null_oby(oby_scale); + this->run_test_nulls_in_oby(oby_scale); + } +} + +TYPED_TEST(GroupedRollingRangeOrderByDecimalTypedTest, UnboundedRanges) +{ + for (auto const order_by_scale : {-2, -1, 0, 1, 2}) { + auto const order_by_column_scale = scale_type{order_by_scale}; + this->run_test_unbounded_preceding_to_unbounded_following(order_by_column_scale); + this->run_test_unbounded_preceding_to_current_row(order_by_column_scale); + this->run_test_current_row_to_unbounded_following(order_by_column_scale); + } +} + +} // namespace cudf::test::rolling diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp index 99b461f05ee..6372536968a 100644 --- a/cpp/tests/rolling/range_window_bounds_test.cpp +++ 
b/cpp/tests/rolling/range_window_bounds_test.cpp @@ -151,5 +151,62 @@ TYPED_TEST(NumericRangeWindowBoundsTest, WrongRangeType) cudf::logic_error); } +template +struct DecimalRangeBoundsTest : RangeWindowBoundsTest { +}; + +TYPED_TEST_SUITE(DecimalRangeBoundsTest, cudf::test::FixedPointTypes); + +TYPED_TEST(DecimalRangeBoundsTest, BoundsConstruction) +{ + using namespace numeric; + using DecimalT = TypeParam; + using Rep = cudf::detail::range_rep_type; + + // Interval type must match the decimal type. + static_assert(std::is_same_v, DecimalT>); + + auto const range_3 = + range_window_bounds::get(fixed_point_scalar{Rep{3}, scale_type{0}}); + EXPECT_FALSE(range_3.is_unbounded() && + "range_window_bounds constructed from scalar cannot be unbounded."); + EXPECT_EQ(cudf::detail::range_comparable_value(range_3), Rep{3}); + + auto const range_unbounded = range_window_bounds::unbounded(data_type{type_to_id()}); + EXPECT_TRUE(range_unbounded.is_unbounded() && + "range_window_bounds::unbounded() must return an unbounded range."); +} + +TYPED_TEST(DecimalRangeBoundsTest, Rescale) +{ + using namespace numeric; + using DecimalT = TypeParam; + using RepT = typename DecimalT::rep; + + // Powers of 10. + auto constexpr pow10 = std::array{1, 10, 100, 1000, 10000, 100000}; + + // Check that the rep has expected values at different range scales. 
+ auto const order_by_scale = -2; + auto const order_by_data_type = data_type{type_to_id(), order_by_scale}; + + for (auto const range_scale : {-2, -1, 0, 1, 2}) { + auto const decimal_range_bounds = + range_window_bounds::get(fixed_point_scalar{RepT{20}, scale_type{range_scale}}); + auto const rescaled_range_rep = + cudf::detail::range_comparable_value(decimal_range_bounds, order_by_data_type); + EXPECT_EQ(rescaled_range_rep, RepT{20} * pow10[range_scale - order_by_scale]); + } + + // Order By column scale cannot exceed range scale: + { + auto const decimal_range_bounds = + range_window_bounds::get(fixed_point_scalar{RepT{200}, scale_type{-3}}); + EXPECT_THROW( + cudf::detail::range_comparable_value(decimal_range_bounds, order_by_data_type), + cudf::logic_error); + } +} + } // namespace test } // namespace cudf diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java index 205efadfe6c..2fb202a72b1 100644 --- a/java/src/main/java/ai/rapids/cudf/Scalar.java +++ b/java/src/main/java/ai/rapids/cudf/Scalar.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -754,6 +754,8 @@ public boolean equals(Object o) { case TIMESTAMP_NANOSECONDS: case DECIMAL64: return getLong() == other.getLong(); + case DECIMAL128: + return getBigDecimal().equals(other.getBigDecimal()); case STRING: return Arrays.equals(getUTF8(), other.getUTF8()); case LIST: @@ -819,6 +821,9 @@ public int hashCode() { valueHash = v.hashCode(); } break; + case DECIMAL128: + valueHash = getBigDecimal().hashCode(); + break; default: throw new IllegalStateException("Unknown scalar type: " + type); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b2130978ff9..4bab7c1a403 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -3916,6 +3916,9 @@ public Table aggregateWindowsOverRanges(AggregationOverWindow... windowAggregate case TIMESTAMP_DAYS: case TIMESTAMP_NANOSECONDS: case TIMESTAMP_MICROSECONDS: + case DECIMAL32: + case DECIMAL64: + case DECIMAL128: break; default: throw new IllegalArgumentException("Expected range-based window orderBy's " + diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index a73251590c2..194c1094caf 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -50,6 +50,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.*; +import java.util.function.IntFunction; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -5772,6 +5773,124 @@ void testRangeWindowingCountUnboundedDESCWithNullsLast() { } } + /** + * Helper for constructing BigInteger from int + * @param x Integer value + * @return BigInteger equivalent of x + */ + private static BigInteger big(int x) + { + return new BigInteger("" + x); + } + + /** + * Helper to get scalar for preceding == Decimal(value), + * with data width depending upon the 
the order-by + * column index: + * orderby_col_idx = 2 -> Decimal32 + * orderby_col_idx = 3 -> Decimal64 + * orderby_col_idx = 4 -> Decimal128 + */ + private static Scalar getDecimalScalarRangeBounds(int scale, int unscaledValue, int orderby_col_idx) + { + switch(orderby_col_idx) + { + case 2: return Scalar.fromDecimal(scale, unscaledValue); + case 3: return Scalar.fromDecimal(scale, Long.valueOf(unscaledValue)); + case 4: return Scalar.fromDecimal(scale, big(unscaledValue)); + default: + throw new IllegalStateException("Unexpected order by column index: " + + orderby_col_idx); + } + } + + @Test + void testRangeWindowsWithDecimalOrderBy() { + try (Table unsorted = new Table.TestBuilder() + .column(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) // GBY Key + .column(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3) // GBY Key + .decimal32Column(-1, 4000, 3000, 2000, 1000, + 4000, 3000, 2000, 1000, + 4000, 3000, 2000, 1000) // Decimal OBY Key + .decimal64Column(-1, 4000l, 3000l, 2000l, 1000l, + 4000l, 3000l, 2000l, 1000l, + 4000l, 3000l, 2000l, 1000l) // Decimal OBY Key + .decimal128Column(-1, RoundingMode.UNNECESSARY, + big(4000), big(3000), big(2000), big(1000), + big(4000), big(3000), big(2000), big(1000), + big(4000), big(3000), big(2000), big(1000)) + .column(9, 1, 5, 7, 2, 8, 9, 7, 6, 6, 0, 8) // Agg Column + .build()) { + + // Columns 2,3,4 are decimal order-by columns of type DECIMAL32, DECIMAL64, + // and DECIMAL128 respectively, with similarly ordered values. + // In the following loop, each decimal type is tested as the order-by column, + // producing the same results with similar range bounds. 
+ for (int decimal_oby_col_idx = 2; decimal_oby_col_idx <= 4; ++decimal_oby_col_idx) { + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), + OrderByArg.asc(1), + OrderByArg.asc(decimal_oby_col_idx)); + ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { + ColumnVector sortedAggColumn = sorted.getColumn(5); + assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); + + // Test Window functionality with range window (200 PRECEDING and 100 FOLLOWING) + try (Scalar preceding200 = getDecimalScalarRangeBounds(0, 200, decimal_oby_col_idx); + Scalar following100 = getDecimalScalarRangeBounds(2, 1, decimal_oby_col_idx); + WindowOptions window = WindowOptions.builder() + .minPeriods(1) + .window(preceding200, following100) + .orderByColumnIndex(decimal_oby_col_idx) + .build()) { + + try (Table windowAggResults = sorted.groupBy(0, 1) + .aggregateWindowsOverRanges(RollingAggregation.count() + .onColumn(5) + .overWindow(window)); + ColumnVector expect = ColumnVector.fromBoxedInts(2, 3, 4, 3, 2, 3, 4, 3, 2, 3, 4, 3)) { + assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); + } + } + + // Test Window functionality with range window (UNBOUNDED PRECEDING and CURRENT ROW) + try (Scalar current_row = getDecimalScalarRangeBounds(0, 0, decimal_oby_col_idx); + WindowOptions window = WindowOptions.builder() + .minPeriods(1) + .unboundedPreceding() + .following(current_row) + .orderByColumnIndex(decimal_oby_col_idx) + .build()) { + + try (Table windowAggResults = sorted.groupBy(0, 1) + .aggregateWindowsOverRanges(RollingAggregation.count() + .onColumn(5) + .overWindow(window)); + ColumnVector expect = ColumnVector.fromBoxedInts(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4)) { + assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); + } + } + + // Test Window functionality with range window (UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING) + try (WindowOptions window = WindowOptions.builder() + .minPeriods(1) + 
.unboundedPreceding() + .unboundedFollowing() + .orderByColumnIndex(decimal_oby_col_idx) + .build()) { + + try (Table windowAggResults = sorted.groupBy(0, 1) + .aggregateWindowsOverRanges(RollingAggregation.count() + .onColumn(5) + .overWindow(window)); + ColumnVector expect = ColumnVector.fromBoxedInts(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)) { + assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); + } + } + } + } + } + } + @Test void testGroupByCountWithNulls() { try (Table t1 = new Table.TestBuilder().column(null, null, 1, 1, 1, 1) From 0032a7c23f0154050bb314be09c52b0e05339e7c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 13 Sep 2022 15:52:16 -0700 Subject: [PATCH 20/25] Fix compile error due to missing header (#11697) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recently merged PR (https://github.com/rapidsai/cudf/pull/11551) did not include the `` header which may cause compile error in some systems (in particular, CUDA 11.7 + gcc-11.2): ``` error: ‘std::optional’ has not been declared error: ‘optional’ in namespace ‘std’ does not name a template type ``` This PR adds that missing header to fix the compile issue. 
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/11697 --- cpp/src/io/comp/nvcomp_adapter.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 41af564ca76..a13cb031163 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -23,6 +23,8 @@ #include +#include + namespace cudf::io::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE }; From 18bfbe78ad4ec55aa688aaf63916e4e0a5054737 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 14 Sep 2022 10:25:42 -0500 Subject: [PATCH 21/25] Fix `DataFrame.from_arrow` to preserve type metadata (#11698) Fixes: #11693 This PR fixes `DataFrame.from_arrow` which does not preserve type metadata for `struct`, `list` & `decimal` types. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11698 --- python/cudf/cudf/core/frame.py | 79 ++++++++++++------------ python/cudf/cudf/tests/test_dataframe.py | 20 ++++++ 2 files changed, 61 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ec78a8a37cf..c4aa6f4663c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1012,8 +1012,8 @@ def from_arrow(cls, data): ) column_names = data.column_names - pandas_dtypes = None - np_dtypes = None + pandas_dtypes = {} + np_dtypes = {} if isinstance(data.schema.pandas_metadata, dict): metadata = data.schema.pandas_metadata pandas_dtypes = { @@ -1085,42 +1085,45 @@ def from_arrow(cls, data): # There are some special cases that need to be handled # based on metadata. 
- if pandas_dtypes: - for name in result: - dtype = None - if ( - len(result[name]) == 0 - and pandas_dtypes[name] == "categorical" - ): - # When pandas_dtype is a categorical column and the size - # of column is 0(i.e., empty) then we will have an - # int8 column in result._data[name] returned by libcudf, - # which needs to be type-casted to 'category' dtype. - dtype = "category" - elif ( - pandas_dtypes[name] == "empty" - and np_dtypes[name] == "object" - ): - # When a string column has all null values, pandas_dtype is - # is specified as 'empty' and np_dtypes as 'object', - # hence handling this special case to type-cast the empty - # float column to str column. - dtype = np_dtypes[name] - elif pandas_dtypes[ - name - ] == "object" and cudf.api.types.is_struct_dtype( - np_dtypes[name] - ): - # Incase of struct column, libcudf is not aware of names of - # struct fields, hence renaming the struct fields is - # necessary by extracting the field names from arrow - # struct types. - result[name] = result[name]._rename_fields( - [field.name for field in data[name].type] - ) - - if dtype is not None: - result[name] = result[name].astype(dtype) + for name in result: + if ( + len(result[name]) == 0 + and pandas_dtypes.get(name) == "categorical" + ): + # When pandas_dtype is a categorical column and the size + # of column is 0 (i.e., empty) then we will have an + # int8 column in result._data[name] returned by libcudf, + # which needs to be type-casted to 'category' dtype. + result[name] = result[name].as_categorical_column("category") + elif ( + pandas_dtypes.get(name) == "empty" + and np_dtypes.get(name) == "object" + ): + # When a string column has all null values, pandas_dtype is + # is specified as 'empty' and np_dtypes as 'object', + # hence handling this special case to type-cast the empty + # float column to str column. 
+ result[name] = result[name].as_string_column(cudf.dtype("str")) + elif name in data.column_names and isinstance( + data[name].type, + (pa.StructType, pa.ListType, pa.Decimal128Type), + ): + # In case of struct column, libcudf is not aware of names of + # struct fields, hence renaming the struct fields is + # necessary by extracting the field names from arrow + # struct types. + + # In case of decimal column, libcudf is not aware of the + # decimal precision. + + # In case of list column, there is a possibility of nested + # list columns to have struct or decimal columns inside them. + + # All of these cases are handled by calling the + # _with_type_metadata method on the column. + result[name] = result[name]._with_type_metadata( + cudf.utils.dtypes.cudf_dtype_from_pa_type(data[name].type) + ) return cls._from_data({name: result[name] for name in column_names}) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index af719958c1a..fbc4fd619a1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9549,3 +9549,23 @@ def test_non_string_column_name_to_arrow(data): actual = pa.Table.from_pandas(df.to_pandas()) assert expected.equals(actual) + + +def test_complex_types_from_arrow(): + + expected = pa.Table.from_arrays( + [ + pa.array([1, 2, 3]), + pa.array([10, 20, 30]), + pa.array([{"a": 9}, {"b": 10}, {"c": 11}]), + pa.array([[{"a": 1}], [{"b": 2}], [{"c": 3}]]), + pa.array([10, 11, 12]).cast(pa.decimal128(21, 2)), + pa.array([{"a": 9}, {"b": 10, "c": {"g": 43}}, {"c": {"a": 10}}]), + ], + names=["a", "b", "c", "d", "e", "f"], + ) + + df = cudf.DataFrame.from_arrow(expected) + actual = df.to_arrow() + + assert expected.equals(actual) From d1d879e3b64ec894b0d6553be0e998347ef458e4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 14 Sep 2022 21:54:26 +0100 Subject: [PATCH 22/25] Drop split_out=None test from groupby.agg (#11704) Discussion in dask/dask#9490 and 
dask/dask#9491 suggests that split_out=None as a default value was never really intended and is likely to be deprecated. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ashwin Srinath (https://github.com/shwina) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/11704 --- .../dask_cudf/dask_cudf/tests/test_groupby.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index a8e9e8e92aa..aa90edd34a8 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -584,7 +584,7 @@ def test_groupby_categorical_key(): @pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("split_out", [None, 1, 2]) +@pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) def test_groupby_agg_params(npartitions, split_every, split_out, as_index): @@ -602,14 +602,17 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "c": ["mean", "std", "var"], } + split_kwargs = {"split_every": split_every, "split_out": split_out} + if split_out == "use_dask_default": + split_kwargs.pop("split_out") + # Check `sort=True` behavior if split_out == 1: gf = ( ddf.groupby(["name", "a"], sort=True, as_index=as_index) .aggregate( agg_dict, - split_every=split_every, - split_out=split_out, + **split_kwargs, ) .compute() ) @@ -630,13 +633,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): # Full check (`sort=False`) gr = ddf.groupby(["name", "a"], sort=False, as_index=as_index).aggregate( agg_dict, - split_every=split_every, - split_out=split_out, + **split_kwargs, ) pr = pddf.groupby(["name", "a"], 
sort=False).agg( agg_dict, - split_every=split_every, - split_out=split_out, + **split_kwargs, ) # Test `as_index` argument @@ -648,7 +649,9 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): assert ("name", "") in gr.columns and ("a", "") in gr.columns # Check `split_out` argument - assert gr.npartitions == (split_out or 1) + assert gr.npartitions == ( + 1 if split_out == "use_dask_default" else split_out + ) # Compute for easier multiindex handling gf = gr.compute() From 66f6960231265d582d9246d06e2ac0ec52bcae86 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 14 Sep 2022 15:58:45 -0700 Subject: [PATCH 23/25] Modify ORC reader timestamp parsing to match the apache reader behavior (#11699) Closes #11525 Not sure why, but the apache Java ORC reader does the following when reading negative timestamps: https://github.com/apache/orc/blob/main/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java#L1284-L1285 This detail does not impact cuDF and pyorc writers (reading cudf files with apache reader already works) because these libraries write negative timestamps with negative nanoseconds. This PR modifies the ORC reader behavior to match the apache reader so that cuDF correctly reads ORC files written by the apache reader. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) - Elias Stehle (https://github.com/elstehle) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/11699 --- cpp/src/io/orc/stripe_data.cu | 3 ++- .../data/orc/TestOrcFile.apache_timestamp.orc | Bin 0 -> 302 bytes python/cudf/cudf/tests/test_orc.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 4fa407f4e88..c9cc0f04b3c 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1770,7 +1770,8 @@ __global__ void __launch_bounds__(block_size) // Adjust seconds only for negative timestamps with positive nanoseconds. // Alternative way to represent negative timestamps is with negative nanoseconds // in which case the adjustment in not needed. - if (seconds < 0 && nanos > 0) { seconds -= 1; } + // Comparing with 999999 instead of zero to match the apache writer. 
+ if (seconds < 0 and nanos > 999999) { seconds -= 1; } duration_ns d_ns{nanos}; duration_s d_s{seconds}; diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc new file mode 100644 index 0000000000000000000000000000000000000000..dd51856c3f7c4f8410b63b111ab84d159f028f65 GIT binary patch literal 302 zcmeYdau#G@;9?VE;b074a0N0IxY!uLKuC;((Mv$${MGx9&s<$)!7R|iD8&V&3?u{<95fh}I5>EO7#XD) z%aoWnIJFeHB=U>W^%4sblQUBFiVG5pvh~VRi;6Sz^MqK9^$hil3>dAzHd&$C#LJ-( nzz`wmFu~7*A%c-3SAa>Pp+P{2iGfE$)quU}3p2BSkh3@dM@~hL literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 18d159bc423..e7ca50f5e92 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1871,3 +1871,12 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): assert_eq(negative_timestamp_df, pd.read_orc(buffer)) assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) + + +def test_orc_reader_apache_negative_timestamp(datadir): + path = datadir / "TestOrcFile.apache_timestamp.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) + + assert_eq(pdf, gdf) From 9c3afc3c64fef9033cd7b9122e086ea8b05c9850 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Sep 2022 01:59:37 +0200 Subject: [PATCH 24/25] Special-case multibyte_split for single-byte delimiter (#11681) multibyte_split does two scans to determine delimiters: One using the trie to determine matches, one to determine the offsets. For single-byte delimiters, the trie scan is unnecessary, since we can determine without context what is a delimiter. So I added a single `byte_split_kernel` by replacing the trie logic with a char comparison. 
The difference is quite significant: | source_type | delim_size | delim_percent | size_approx | GPU Time | Peak Memory Usage | Encoded file size | |-------------|------------|---------------|-------------------|------------|-------------------|-------------------| | 0 | 1 | 1 | 2^30 = 1073741824 | 110.196 ms | 3.947 GiB | 1006.638 MiB | | 0 | 2 | 1 | 2^30 = 1073741824 | 198.067 ms | 3.745 GiB | 1011.775 MiB | | 0 | 1 | 25 | 2^30 = 1073741824 | 122.626 ms | 9.978 GiB | 762.462 MiB | | 0 | 2 | 25 | 2^30 = 1073741824 | 224.000 ms | 6.163 GiB | 889.541 MiB | This might point to the fact that the custom prefix scan implementation in multibyte_split is a bottleneck, but this needs more investigation. Authors: - Tobias Ribizel (https://github.com/upsj) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/11681 --- cpp/src/io/text/multibyte_split.cu | 109 ++++++++++++++++++--- cpp/tests/io/text/multibyte_split_test.cpp | 55 ++++++++++- 2 files changed, 149 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 5313d7a89ba..133c5fe9826 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -302,6 +302,73 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( } } +__global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( + cudf::size_type base_tile_idx, + int64_t base_input_offset, + int64_t base_offset_offset, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + char delim, + cudf::device_span chunk_input_chars, + int64_t byte_range_end, + cudf::split_device_span output_offsets) +{ + using InputLoad = + cub::BlockLoad; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + + __shared__ union { + typename InputLoad::TempStorage input_load; + typename 
OffsetScan::TempStorage offset_scan; + } temp_storage; + + int32_t const tile_idx = base_tile_idx + blockIdx.x; + int32_t const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; + int32_t const thread_input_offset = tile_input_offset + threadIdx.x * ITEMS_PER_THREAD; + int32_t const thread_input_size = chunk_input_chars.size() - thread_input_offset; + + // STEP 1: Load inputs + + char thread_chars[ITEMS_PER_THREAD]; + + InputLoad(temp_storage.input_load) + .Load(chunk_input_chars.data() + tile_input_offset, + thread_chars, + chunk_input_chars.size() - tile_input_offset); + + // STEP 2: Flag matches + + cutoff_offset thread_offset; + uint32_t thread_match_mask[(ITEMS_PER_THREAD + 31) / 32]{}; + + for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { + auto const is_match = i < thread_input_size and thread_chars[i] == delim; + auto const match_end = base_input_offset + thread_input_offset + i + 1; + auto const is_past_range = match_end >= byte_range_end; + thread_match_mask[i / 32] |= uint32_t{is_match} << (i % 32); + thread_offset = thread_offset + cutoff_offset{is_match, is_past_range}; + } + + // STEP 3: Scan flags to determine absolute thread output offset + + auto prefix_callback = OffsetScanCallback(tile_output_offsets, tile_idx); + + __syncthreads(); // required before temp_memory re-use + OffsetScan(temp_storage.offset_scan).ExclusiveSum(thread_offset, thread_offset, prefix_callback); + + // Step 4: Assign outputs from each thread using match offsets. 
+ + for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { + auto const is_match = (thread_match_mask[i / 32] >> (i % 32)) & 1u; + if (is_match && !thread_offset.is_past_end()) { + auto const match_end = base_input_offset + thread_input_offset + i + 1; + auto const is_past_range = match_end >= byte_range_end; + output_offsets[thread_offset.offset() - base_offset_offset] = match_end; + thread_offset = thread_offset + cutoff_offset{true, is_past_range}; + } + } +} + } // namespace namespace cudf { @@ -615,19 +682,35 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source cudaStreamWaitEvent(scan_stream.value(), last_launch_event); - multibyte_split_kernel<<>>( // - base_tile_idx, - chunk_offset, - offset_storage.size(), - tile_multistates, - tile_offsets, - {device_delim.data(), static_cast(device_delim.size())}, - *chunk, - byte_range_end, - offset_output); + if (delimiter.size() == 1) { + // the single-byte case allows for a much more efficient kernel, so we special-case it + byte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + offset_storage.size(), + tile_offsets, + delimiter[0], + *chunk, + byte_range_end, + offset_output); + } else { + multibyte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + offset_storage.size(), + tile_multistates, + tile_offsets, + {device_delim.data(), static_cast(device_delim.size())}, + *chunk, + byte_range_end, + offset_output); + } // load the next chunk auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 8ec88696355..43debf3d5b3 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -197,7 +197,7 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange) auto host_input = std::string(); auto host_expected = std::vector(); - for (auto i = 0; i < 1000; i++) { + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { host_input += "...:|"; } @@ 
-222,7 +222,7 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange) auto host_input = std::string(); auto host_expected = std::vector(); - for (auto i = 0; i < 1000; i++) { + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { host_input += "....."; } @@ -244,6 +244,57 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); } +TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte) +{ + auto host_input = std::string(); + auto host_expected = std::vector(); + + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { + host_input += "...:|"; + } + + auto delimiter = std::string("|"); + auto source = cudf::io::text::make_source(host_input); + + auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); + auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); + auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); + auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + + auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); + auto out = cudf::concatenate(out_views); + + auto expected = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); +} + +TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRangeSingleByte) +{ + auto host_input = std::string(); + auto host_expected = std::vector(); + + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { + host_input += "....."; + } + + auto delimiter = std::string("|"); + host_input[host_input.size() / 2] = '|'; + auto source = cudf::io::text::make_source(host_input); + + auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); + auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); + auto out1 = cudf::io::text::multibyte_split(*source, delimiter, 
byte_ranges[1]); + auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + + auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); + auto out = cudf::concatenate(out_views); + + auto expected = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); +} + TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges) { using namespace cudf::io::text; From 75d126ac84bb6087bf4cc4953fff113b48d4c023 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 15 Sep 2022 01:18:03 -0400 Subject: [PATCH 25/25] Add generic type inference for cuIO (#11121) This PR adds the type inference component for the nested JSON work. Authors: - Yunsong Wang (https://github.com/PointKernel) - Elias Stehle (https://github.com/elstehle) Approvers: - Elias Stehle (https://github.com/elstehle) - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/11121 --- cpp/src/io/fst/agent_dfa.cuh | 2 +- .../io/utilities/column_type_histogram.hpp | 18 +- cpp/src/io/utilities/parsing_utils.cuh | 18 + cpp/src/io/utilities/type_inference.cuh | 315 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/type_inference_test.cu | 265 +++++++++++++++ 6 files changed, 609 insertions(+), 10 deletions(-) create mode 100644 cpp/src/io/utilities/type_inference.cuh create mode 100644 cpp/tests/io/type_inference_test.cu diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index d847598d6dd..e02c7ff85fa 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -422,7 +422,7 @@ struct AgentDFA { OffsetT const num_total_symbols, StateIndexT& state, CallbackOpT& callback_op, - cub::Int2Type /**/) + cub::Int2Type) { using StateTransitionOpT = StateTransitionOp; diff --git a/cpp/src/io/utilities/column_type_histogram.hpp 
b/cpp/src/io/utilities/column_type_histogram.hpp index 99762595693..8bd2d3a89cf 100644 --- a/cpp/src/io/utilities/column_type_histogram.hpp +++ b/cpp/src/io/utilities/column_type_histogram.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,14 +25,14 @@ namespace io { * @brief Per-column histogram struct containing detected occurrences of each dtype */ struct column_type_histogram { - cudf::size_type null_count; - cudf::size_type float_count; - cudf::size_type datetime_count; - cudf::size_type string_count; - cudf::size_type negative_small_int_count; - cudf::size_type positive_small_int_count; - cudf::size_type big_int_count; - cudf::size_type bool_count; + cudf::size_type null_count{}; + cudf::size_type float_count{}; + cudf::size_type datetime_count{}; + cudf::size_type string_count{}; + cudf::size_type negative_small_int_count{}; + cudf::size_type positive_small_int_count{}; + cudf::size_type big_int_count{}; + cudf::size_type bool_count{}; }; } // namespace io diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index a3699acb934..118fde6fdb6 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -42,6 +42,16 @@ using cudf::device_span; namespace cudf { namespace io { +/** + * @brief Non-owning view for json type inference options + */ +struct json_inference_options_view { + char quote_char; + cudf::detail::trie_view trie_true; + cudf::detail::trie_view trie_false; + cudf::detail::trie_view trie_na; +}; + /** * @brief Structure for holding various options used when parsing and * converting CSV/json data to cuDF data type values. 
@@ -79,6 +89,14 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; + [[nodiscard]] json_inference_options_view json_view() const + { + return {quotechar, + cudf::detail::make_trie_view(trie_true), + cudf::detail::make_trie_view(trie_false), + cudf::detail::make_trie_view(trie_na)}; + } + [[nodiscard]] parse_options_view view() const { return {delimiter, diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh new file mode 100644 index 00000000000..578c72fc316 --- /dev/null +++ b/cpp/src/io/utilities/type_inference.cuh @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace cudf::io::detail { +/** + * @brief Custom column_type_histogram sum reduction callable + */ +struct custom_sum { + __device__ inline cudf::io::column_type_histogram operator()( + cudf::io::column_type_histogram const& lhs, cudf::io::column_type_histogram const& rhs) + { + return {lhs.null_count + rhs.null_count, + lhs.float_count + rhs.float_count, + lhs.datetime_count + rhs.datetime_count, + lhs.string_count + rhs.string_count, + lhs.negative_small_int_count + rhs.negative_small_int_count, + lhs.positive_small_int_count + rhs.positive_small_int_count, + lhs.big_int_count + rhs.big_int_count, + lhs.bool_count + rhs.bool_count}; + } +}; + +/** + * @brief Returns true if the input character is a valid digit. + * Supports both decimal and hexadecimal digits (uppercase and lowercase). + * + * @param c Character to check + * @param is_hex Whether to check as a hexadecimal + * + * @return `true` if it is digit-like, `false` otherwise + */ +__device__ __inline__ bool is_digit(char const c, bool const is_hex = false) +{ + if (c >= '0' && c <= '9') return true; + + if (is_hex) { + if (c >= 'A' && c <= 'F') return true; + if (c >= 'a' && c <= 'f') return true; + } + + return false; +} + +/** + * @brief Returns true if the counters indicate a potentially valid float. + * False positives are possible because positions are not taken into account. + * For example, field "e.123-" would match the pattern. 
+ */ +__device__ __inline__ bool is_like_float(std::size_t len, + uint32_t digit_cnt, + uint32_t decimal_cnt, + uint32_t dash_cnt, + uint32_t exponent_cnt) +{ + // Can't have more than one exponent and one decimal point + if (decimal_cnt > 1) return false; + if (exponent_cnt > 1) return false; + // Without the exponent or a decimal point, this is an integer, not a float + if (decimal_cnt == 0 && exponent_cnt == 0) return false; + + // Can only have one '-' per component + if (dash_cnt > 1 + exponent_cnt) return false; + + // If anything other than these characters is present, it's not a float + if (digit_cnt + decimal_cnt + dash_cnt + exponent_cnt != len) return false; + + // Needs at least 1 digit, 2 if exponent is present + if (digit_cnt < 1 + exponent_cnt) return false; + + return true; +} + +/** + * @brief Constructs column type histogram for a given column string input `data`. + * + * @tparam BlockSize Number of threads in each block + * @tparam OptionsView Type of inference options view + * @tparam ColumnStringIter Iterator type whose `value_type` is a + * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and + * `offset_t` needs to be convertible to `std::size_t`. 
+ * + * @param[in] options View of inference options + * @param[in] data JSON string input + * @param[in] column_strings_begin The begining of an offset-length tuple sequence + * @param[in] size Size of the string input + * @param[out] column_info Histogram of column type counters + */ +template +__global__ void infer_column_type_kernel(OptionsView options, + device_span data, + ColumnStringIter column_strings_begin, + std::size_t size, + cudf::io::column_type_histogram* column_info) +{ + auto thread_type_histogram = cudf::io::column_type_histogram{}; + + for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); + auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); + auto const field_begin = data.begin() + field_offset; + + if (cudf::detail::serialized_trie_contains( + options.trie_na, {field_begin, static_cast(field_len)})) { + ++thread_type_histogram.null_count; + continue; + } + + // Handling strings + if (field_len >= 2 and *field_begin == options.quote_char and + field_begin[field_len - 1] == options.quote_char) { + ++thread_type_histogram.string_count; + continue; + } + + uint32_t digit_count = 0; + uint32_t decimal_count = 0; + uint32_t slash_count = 0; + uint32_t dash_count = 0; + uint32_t plus_count = 0; + uint32_t colon_count = 0; + uint32_t exponent_count = 0; + uint32_t other_count = 0; + + auto const maybe_hex = + (field_len > 2 && field_begin[0] == '0' && field_begin[1] == 'x') || + (field_len > 3 && field_begin[0] == '-' && field_begin[1] == '0' && field_begin[2] == 'x'); + auto const field_end = field_begin + field_len; + + for (auto pos = field_begin; pos < field_end; ++pos) { + if (is_digit(*pos, maybe_hex)) { + digit_count++; + continue; + } + // Looking for unique characters that will help identify column types + switch (*pos) { + case '.': decimal_count++; break; + case '-': dash_count++; break; + case '+': 
plus_count++; break; + case '/': slash_count++; break; + case ':': colon_count++; break; + case 'e': + case 'E': + if (!maybe_hex && pos > field_begin && pos < field_end - 1) exponent_count++; + break; + default: other_count++; break; + } + } + + // All characters must be digits in an integer, except for the starting sign and 'x' in the + // hexadecimal prefix + auto const int_req_number_cnt = + static_cast(field_len) - + ((*field_begin == '-' || *field_begin == '+') && field_len > 1) - maybe_hex; + if (cudf::detail::serialized_trie_contains( + options.trie_true, {field_begin, static_cast(field_len)}) || + cudf::detail::serialized_trie_contains( + options.trie_false, {field_begin, static_cast(field_len)})) { + ++thread_type_histogram.bool_count; + } else if (digit_count == int_req_number_cnt) { + auto const is_negative = (*field_begin == '-'); + char const* data_begin = field_begin + (is_negative || (*field_begin == '+')); + cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( + data_begin, data_begin + digit_count, is_negative, thread_type_histogram); + ++*ptr; + } else if (is_like_float( + field_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { + ++thread_type_histogram.float_count; + } + // All invalid JSON values are treated as string + else { + ++thread_type_histogram.string_count; + } + } // grid-stride for loop + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + auto const block_type_histogram = + BlockReduce(temp_storage).Reduce(thread_type_histogram, custom_sum{}); + if (threadIdx.x == 0) { + atomicAdd(&column_info->null_count, block_type_histogram.null_count); + atomicAdd(&column_info->float_count, block_type_histogram.float_count); + atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); + atomicAdd(&column_info->string_count, block_type_histogram.string_count); + atomicAdd(&column_info->negative_small_int_count, + 
block_type_histogram.negative_small_int_count); + atomicAdd(&column_info->positive_small_int_count, + block_type_histogram.positive_small_int_count); + atomicAdd(&column_info->big_int_count, block_type_histogram.big_int_count); + atomicAdd(&column_info->bool_count, block_type_histogram.bool_count); + } +} + +/** + * @brief Constructs column type histogram for a given column string input `data`. + * + * @tparam OptionsView Type of inference options view + * @tparam ColumnStringIter Iterator type whose `value_type` is a + * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and + * `offset_t` needs to be convertible to `std::size_t`. + * + * @param options View of inference options + * @param data JSON string input + * @param column_strings_begin The begining of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A histogram containing column-specific type counters + */ +template +cudf::io::column_type_histogram infer_column_type(OptionsView const& options, + cudf::device_span data, + ColumnStringIter column_strings_begin, + std::size_t const size, + rmm::cuda_stream_view stream) +{ + constexpr int block_size = 128; + + auto const grid_size = (size + block_size - 1) / block_size; + auto d_column_info = rmm::device_scalar(stream); + CUDF_CUDA_TRY(cudaMemsetAsync( + d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); + + infer_column_type_kernel<<>>( + options, data, column_strings_begin, size, d_column_info.data()); + + return d_column_info.value(stream); +} + +/** + * @brief Infers data type for a given JSON string input `data`. 
+ * + * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if date time is not inferred as string + * @throw cudf::logic_error if data type inference failed + * + * @tparam OptionsView Type of inference options view + * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to + * `thrust::tuple` + * + * @param options View of inference options + * @param data JSON string input + * @param column_strings_begin The begining of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The inferred data type + */ +template +cudf::data_type infer_data_type(OptionsView const& options, + device_span data, + ColumnStringIter column_strings_begin, + std::size_t const size, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); + + auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); + + auto get_type_id = [&](auto const& cinfo) { + auto int_count_total = + cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; + if (cinfo.null_count == static_cast(size)) { + // Entire column is NULL; allocate the smallest amount of memory + return type_id::INT8; + } else if (cinfo.string_count > 0) { + return type_id::STRING; + } else if (cinfo.datetime_count > 0) { + CUDF_FAIL("Date time is inferred as string.\n"); + } else if (cinfo.float_count > 0 || (int_count_total > 0 && cinfo.null_count > 0)) { + return type_id::FLOAT64; + } else if (cinfo.big_int_count == 0 && int_count_total != 0) { + return type_id::INT64; + } else if (cinfo.big_int_count != 0 && cinfo.negative_small_int_count != 0) { + return type_id::STRING; + } else if (cinfo.big_int_count != 0) { + return type_id::UINT64; + } else if (cinfo.bool_count > 0) { + return type_id::BOOL8; + } + CUDF_FAIL("Data type inference failed.\n"); + }; + return 
cudf::data_type{get_type_id(h_column_info)}; +} +} // namespace cudf::io::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3710bc6cdfa..b31d6d30381 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -232,6 +232,7 @@ ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) +ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu new file mode 100644 index 00000000000..8ba66b6369b --- /dev/null +++ b/cpp/tests/io/type_inference_test.cu @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +using cudf::io::parse_options; +using cudf::io::detail::infer_data_type; + +// Base test fixture for tests +struct TypeInference : public cudf::test::BaseFixture { +}; + +TEST_F(TypeInference, Basic) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([42,52,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); +} + +TEST_F(TypeInference, Null) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([52,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 1, 4}; + 
auto const string_length = std::vector{0, 2, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + EXPECT_EQ(res_type, + cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior +} + +TEST_F(TypeInference, AllNull) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([null])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT8}); // INT8 if all nulls +} + +TEST_F(TypeInference, String) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = 
cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json(["1990","8","25"])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); +} + +TEST_F(TypeInference, Bool) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([true,false,false])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::BOOL8}); +} + +TEST_F(TypeInference, Timestamp) +{ + auto const stream = rmm::cuda_stream_default; + 
+ auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([1970/2/5,1970/8/25])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + // All data time (quoted and unquoted) is inferred as string for now + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); +} + +TEST_F(TypeInference, InvalidInput) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([1,2,3,a,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 5; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto 
res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + // Invalid input is inferred as string for now + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); +}