diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e993f548e1d..1e1ad94ab0b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,7 +72,7 @@ repos: args: ['-fallback-style=none'] - id: cmake-format name: cmake-format - entry: bash cpp/scripts/run-cmake-format.sh cmake-format + entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions @@ -81,7 +81,7 @@ repos: - cmake-format==0.6.11 - id: cmake-lint name: cmake-lint - entry: bash cpp/scripts/run-cmake-format.sh cmake-lint + entry: ./cpp/scripts/run-cmake-format.sh cmake-lint language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dd94954a82..3ccc1ccbc8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,262 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v22.02.00a for the late Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch. -# cuDF 21.10.00 (Date TBD) +# cuDF 21.10.00 (7 Oct 2021) -Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch. +## 🚨 Breaking Changes + +- Remove Cython APIs for table view generation ([#9199](https://github.com/rapidsai/cudf/pull/9199)) [@vyasr](https://github.com/vyasr) +- Upgrade `pandas` version in `cudf` ([#9147](https://github.com/rapidsai/cudf/pull/9147)) [@galipremsagar](https://github.com/galipremsagar) +- Make AST operators nullable ([#9096](https://github.com/rapidsai/cudf/pull/9096)) [@vyasr](https://github.com/vyasr) +- Remove the option to pass data types as strings to `read_csv` and `read_json` ([#9079](https://github.com/rapidsai/cudf/pull/9079)) [@vuule](https://github.com/vuule) +- Update JNI java CSV APIs to not use deprecated API ([#9066](https://github.com/rapidsai/cudf/pull/9066)) [@revans2](https://github.com/revans2) +- Support additional format specifiers in from_timestamps ([#9047](https://github.com/rapidsai/cudf/pull/9047)) [@davidwendt](https://github.com/davidwendt) +- Expose expression base class publicly and simplify public AST API ([#9045](https://github.com/rapidsai/cudf/pull/9045)) [@vyasr](https://github.com/vyasr) +- Add support for struct type in ORC writer ([#9025](https://github.com/rapidsai/cudf/pull/9025)) [@vuule](https://github.com/vuule) +- Remove aliases of various api.types APIs from utils.dtypes. 
([#9011](https://github.com/rapidsai/cudf/pull/9011)) [@vyasr](https://github.com/vyasr) +- Java bindings for conditional join output sizes ([#9002](https://github.com/rapidsai/cudf/pull/9002)) [@jlowe](https://github.com/jlowe) +- Move compute_column API out of ast namespace ([#8957](https://github.com/rapidsai/cudf/pull/8957)) [@vyasr](https://github.com/vyasr) +- `cudf.dtype` function ([#8949](https://github.com/rapidsai/cudf/pull/8949)) [@shwina](https://github.com/shwina) +- Refactor Frame reductions ([#8944](https://github.com/rapidsai/cudf/pull/8944)) [@vyasr](https://github.com/vyasr) +- Add nested column selection to parquet reader ([#8933](https://github.com/rapidsai/cudf/pull/8933)) [@devavret](https://github.com/devavret) +- JNI Aggregation Type Changes ([#8919](https://github.com/rapidsai/cudf/pull/8919)) [@revans2](https://github.com/revans2) +- Add groupby_aggregation and groupby_scan_aggregation classes and force their usage. ([#8906](https://github.com/rapidsai/cudf/pull/8906)) [@nvdbaranec](https://github.com/nvdbaranec) +- Expand CSV and JSON reader APIs to accept `dtypes` as a vector or map of `data_type` objects ([#8856](https://github.com/rapidsai/cudf/pull/8856)) [@vuule](https://github.com/vuule) +- Change cudf docs theme to pydata theme ([#8746](https://github.com/rapidsai/cudf/pull/8746)) [@galipremsagar](https://github.com/galipremsagar) +- Enable compiled binary ops in libcudf, python and java ([#8741](https://github.com/rapidsai/cudf/pull/8741)) [@karthikeyann](https://github.com/karthikeyann) +- Make groupby transform-like op order match original data order ([#8720](https://github.com/rapidsai/cudf/pull/8720)) [@isVoid](https://github.com/isVoid) + +## 🐛 Bug Fixes + +- `fixed_point` `cudf::groupby` for `mean` aggregation ([#9296](https://github.com/rapidsai/cudf/pull/9296)) [@codereport](https://github.com/codereport) +- Fix `interleave_columns` when the input string lists column having empty child column ([#9292](https://github.com/rapidsai/cudf/pull/9292)) [@ttnghia](https://github.com/ttnghia) +- Update nvcomp to include fixes for installation of headers ([#9276](https://github.com/rapidsai/cudf/pull/9276)) [@devavret](https://github.com/devavret) +- Fix Java column leak in testParquetWriteMap ([#9271](https://github.com/rapidsai/cudf/pull/9271)) [@jlowe](https://github.com/jlowe) +- Fix call to thrust::reduce_by_key in argmin/argmax libcudf groupby ([#9263](https://github.com/rapidsai/cudf/pull/9263)) [@davidwendt](https://github.com/davidwendt) +- Fixing empty input to getMapValue crashing ([#9262](https://github.com/rapidsai/cudf/pull/9262)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix duplicate names issue in `MultiIndex.deserialize ` ([#9258](https://github.com/rapidsai/cudf/pull/9258)) [@galipremsagar](https://github.com/galipremsagar) +- `Dataframe.sort_index` optimizations ([#9238](https://github.com/rapidsai/cudf/pull/9238)) [@galipremsagar](https://github.com/galipremsagar) +- Temporarily disabling problematic test in parquet writer ([#9230](https://github.com/rapidsai/cudf/pull/9230)) [@devavret](https://github.com/devavret) +- Explicitly disable groupby on unsupported key types. 
([#9227](https://github.com/rapidsai/cudf/pull/9227)) [@mythrocks](https://github.com/mythrocks) +- Fix `gather` for sliced input structs column ([#9218](https://github.com/rapidsai/cudf/pull/9218)) [@ttnghia](https://github.com/ttnghia) +- Fix JNI code for left semi and anti joins ([#9207](https://github.com/rapidsai/cudf/pull/9207)) [@jlowe](https://github.com/jlowe) +- Only install thrust when using a non 'system' version ([#9206](https://github.com/rapidsai/cudf/pull/9206)) [@robertmaynard](https://github.com/robertmaynard) +- Remove zlib from libcudf public CMake dependencies ([#9204](https://github.com/rapidsai/cudf/pull/9204)) [@robertmaynard](https://github.com/robertmaynard) +- Fix out-of-bounds memory read in orc gpuEncodeOrcColumnData ([#9196](https://github.com/rapidsai/cudf/pull/9196)) [@davidwendt](https://github.com/davidwendt) +- Fix `gather()` for `STRUCT` inputs with no nulls in members. ([#9194](https://github.com/rapidsai/cudf/pull/9194)) [@mythrocks](https://github.com/mythrocks) +- get_cucollections properly uses rapids_cpm_find ([#9189](https://github.com/rapidsai/cudf/pull/9189)) [@robertmaynard](https://github.com/robertmaynard) +- rapids-export correctly reference build code block and doc strings ([#9186](https://github.com/rapidsai/cudf/pull/9186)) [@robertmaynard](https://github.com/robertmaynard) +- Fix logic while parsing the sum statistic for numerical orc columns ([#9183](https://github.com/rapidsai/cudf/pull/9183)) [@ayushdg](https://github.com/ayushdg) +- Add handling for nulls in `dask_cudf.sorting.quantile_divisions` ([#9171](https://github.com/rapidsai/cudf/pull/9171)) [@charlesbluca](https://github.com/charlesbluca) +- Approximate overflow detection in ORC statistics ([#9163](https://github.com/rapidsai/cudf/pull/9163)) [@vuule](https://github.com/vuule) +- Use decimal precision metadata when reading from parquet files ([#9162](https://github.com/rapidsai/cudf/pull/9162)) [@shwina](https://github.com/shwina) +- Fix variable name in Java build script ([#9161](https://github.com/rapidsai/cudf/pull/9161)) [@jlowe](https://github.com/jlowe) +- Import rapids-cmake modules using the correct cmake variable. ([#9149](https://github.com/rapidsai/cudf/pull/9149)) [@robertmaynard](https://github.com/robertmaynard) +- Fix conditional joins with empty left table ([#9146](https://github.com/rapidsai/cudf/pull/9146)) [@vyasr](https://github.com/vyasr) +- Fix joining on indexes with duplicate level names ([#9137](https://github.com/rapidsai/cudf/pull/9137)) [@shwina](https://github.com/shwina) +- Fixes missing child column name in dtype while reading ORC file. 
([#9134](https://github.com/rapidsai/cudf/pull/9134)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Apply type metadata after column is slice-copied ([#9131](https://github.com/rapidsai/cudf/pull/9131)) [@isVoid](https://github.com/isVoid) +- Fix a bug: inner_join_size return zero if build table is empty ([#9128](https://github.com/rapidsai/cudf/pull/9128)) [@PointKernel](https://github.com/PointKernel) +- Fix multi hive-partition parquet reading in dask-cudf ([#9122](https://github.com/rapidsai/cudf/pull/9122)) [@rjzamora](https://github.com/rjzamora) +- Support null literals in expressions ([#9117](https://github.com/rapidsai/cudf/pull/9117)) [@vyasr](https://github.com/vyasr) +- Fix cudf::hash_join output size for struct joins ([#9107](https://github.com/rapidsai/cudf/pull/9107)) [@jlowe](https://github.com/jlowe) +- Import fix ([#9104](https://github.com/rapidsai/cudf/pull/9104)) [@shwina](https://github.com/shwina) +- Fix cudf::strings::is_fixed_point checking of overflow for decimal32 ([#9093](https://github.com/rapidsai/cudf/pull/9093)) [@davidwendt](https://github.com/davidwendt) +- Fix branch_stack calculation in `row_bit_count()` ([#9076](https://github.com/rapidsai/cudf/pull/9076)) [@mythrocks](https://github.com/mythrocks) +- Fetch rapids-cmake to work around cuCollection cmake issue ([#9075](https://github.com/rapidsai/cudf/pull/9075)) [@jlowe](https://github.com/jlowe) +- Fix compilation errors in groupby benchmarks. ([#9072](https://github.com/rapidsai/cudf/pull/9072)) [@nvdbaranec](https://github.com/nvdbaranec) +- Preserve float16 upscaling ([#9069](https://github.com/rapidsai/cudf/pull/9069)) [@galipremsagar](https://github.com/galipremsagar) +- Fix memcheck read error in libcudf contiguous_split ([#9067](https://github.com/rapidsai/cudf/pull/9067)) [@davidwendt](https://github.com/davidwendt) +- Add support for reading ORC file with no row group index ([#9060](https://github.com/rapidsai/cudf/pull/9060)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Various multiindex related fixes ([#9036](https://github.com/rapidsai/cudf/pull/9036)) [@shwina](https://github.com/shwina) +- Avoid rebuilding cython in build.sh ([#9034](https://github.com/rapidsai/cudf/pull/9034)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add support for percentile dispatch in `dask_cudf` ([#9031](https://github.com/rapidsai/cudf/pull/9031)) [@galipremsagar](https://github.com/galipremsagar) +- cudf resolve nvcc 11.0 compiler crashes during codegen ([#9028](https://github.com/rapidsai/cudf/pull/9028)) [@robertmaynard](https://github.com/robertmaynard) +- Fetch correct grouping keys `agg` of dask groupby ([#9022](https://github.com/rapidsai/cudf/pull/9022)) [@galipremsagar](https://github.com/galipremsagar) +- Allow `where()` to work with a Series and `other=cudf.NA` ([#9019](https://github.com/rapidsai/cudf/pull/9019)) [@sarahyurick](https://github.com/sarahyurick) +- Use correct index when returning Series from `GroupBy.apply()` ([#9016](https://github.com/rapidsai/cudf/pull/9016)) [@charlesbluca](https://github.com/charlesbluca) +- Fix `Dataframe` indexer setitem when array is passed ([#9006](https://github.com/rapidsai/cudf/pull/9006)) [@galipremsagar](https://github.com/galipremsagar) +- Fix ORC reading of files with struct columns that have null values ([#9005](https://github.com/rapidsai/cudf/pull/9005)) 
[@vuule](https://github.com/vuule) +- Ensure JNI native libraries load when CompiledExpression loads ([#8997](https://github.com/rapidsai/cudf/pull/8997)) [@jlowe](https://github.com/jlowe) +- Fix memory read error in get_dremel_data in page_enc.cu ([#8995](https://github.com/rapidsai/cudf/pull/8995)) [@davidwendt](https://github.com/davidwendt) +- Fix memory write error in get_list_child_to_list_row_mapping utility ([#8994](https://github.com/rapidsai/cudf/pull/8994)) [@davidwendt](https://github.com/davidwendt) +- Fix debug compile error for csv_test.cpp ([#8981](https://github.com/rapidsai/cudf/pull/8981)) [@davidwendt](https://github.com/davidwendt) +- Fix memory read/write error in concatenate_lists_ignore_null ([#8978](https://github.com/rapidsai/cudf/pull/8978)) [@davidwendt](https://github.com/davidwendt) +- Fix concatenation of `cudf.RangeIndex` ([#8970](https://github.com/rapidsai/cudf/pull/8970)) [@galipremsagar](https://github.com/galipremsagar) +- Java conditional joins should not require matching column counts ([#8955](https://github.com/rapidsai/cudf/pull/8955)) [@jlowe](https://github.com/jlowe) +- Fix concatenate empty structs ([#8947](https://github.com/rapidsai/cudf/pull/8947)) [@sperlingxx](https://github.com/sperlingxx) +- Fix cuda-memcheck errors for some libcudf functions ([#8941](https://github.com/rapidsai/cudf/pull/8941)) [@davidwendt](https://github.com/davidwendt) +- Apply series name to result of `SeriesGroupby.apply()` ([#8939](https://github.com/rapidsai/cudf/pull/8939)) [@charlesbluca](https://github.com/charlesbluca) +- `cdef packed_columns` as `cppclass` instead of `struct` ([#8936](https://github.com/rapidsai/cudf/pull/8936)) [@charlesbluca](https://github.com/charlesbluca) +- Inserting a `cudf.NA` into a DataFrame ([#8923](https://github.com/rapidsai/cudf/pull/8923)) [@sarahyurick](https://github.com/sarahyurick) +- Support casting with Pandas dtype aliases ([#8920](https://github.com/rapidsai/cudf/pull/8920)) [@sarahyurick](https://github.com/sarahyurick) +- Allow `sort_values` to accept same `kind` values as Pandas ([#8912](https://github.com/rapidsai/cudf/pull/8912)) [@sarahyurick](https://github.com/sarahyurick) +- Enable casting to pandas nullable dtypes ([#8889](https://github.com/rapidsai/cudf/pull/8889)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix libcudf memory errors ([#8884](https://github.com/rapidsai/cudf/pull/8884)) [@karthikeyann](https://github.com/karthikeyann) +- Throw KeyError when accessing field from struct with nonexistent key ([#8880](https://github.com/rapidsai/cudf/pull/8880)) [@NV-jpt](https://github.com/NV-jpt) +- replace auto with auto& ref for cast<&> ([#8866](https://github.com/rapidsai/cudf/pull/8866)) [@karthikeyann](https://github.com/karthikeyann) +- Add missing include<optional> in binops ([#8864](https://github.com/rapidsai/cudf/pull/8864)) [@karthikeyann](https://github.com/karthikeyann) +- Fix `select_dtypes` to work when non-class dtypes present in dataframe ([#8849](https://github.com/rapidsai/cudf/pull/8849)) [@sarahyurick](https://github.com/sarahyurick) +- Re-enable JSON tests ([#8843](https://github.com/rapidsai/cudf/pull/8843)) [@vuule](https://github.com/vuule) +- Support header with embedded delimiter in csv writer ([#8798](https://github.com/rapidsai/cudf/pull/8798)) [@davidwendt](https://github.com/davidwendt) + +## 📖 Documentation + +- Add IO docs page 
in `cudf` documentation ([#9145](https://github.com/rapidsai/cudf/pull/9145)) [@galipremsagar](https://github.com/galipremsagar) +- use correct namespace in cuio code examples ([#9037](https://github.com/rapidsai/cudf/pull/9037)) [@cwharris](https://github.com/cwharris) +- Restructuring `Contributing doc` ([#9026](https://github.com/rapidsai/cudf/pull/9026)) [@iskode](https://github.com/iskode) +- Update stable version in readme ([#9008](https://github.com/rapidsai/cudf/pull/9008)) [@galipremsagar](https://github.com/galipremsagar) +- Add spans and more include guidelines to libcudf developer guide ([#8931](https://github.com/rapidsai/cudf/pull/8931)) [@harrism](https://github.com/harrism) +- Update Java build instructions to mention Arrow S3 and Docker ([#8867](https://github.com/rapidsai/cudf/pull/8867)) [@jlowe](https://github.com/jlowe) +- List GDS-enabled formats in the docs ([#8805](https://github.com/rapidsai/cudf/pull/8805)) [@vuule](https://github.com/vuule) +- Change cudf docs theme to pydata theme ([#8746](https://github.com/rapidsai/cudf/pull/8746)) [@galipremsagar](https://github.com/galipremsagar) + +## 🚀 New Features + +- Revert "Add shallow hash function and shallow equality comparison for column_view ([#9185)" (#9283](https://github.com/rapidsai/cudf/pull/9185)" (#9283)) [@karthikeyann](https://github.com/karthikeyann) +- Align `DataFrame.apply` signature with pandas ([#9275](https://github.com/rapidsai/cudf/pull/9275)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct type support for `drop_list_duplicates` ([#9202](https://github.com/rapidsai/cudf/pull/9202)) [@ttnghia](https://github.com/ttnghia) +- support CUDA async memory resource in JNI ([#9201](https://github.com/rapidsai/cudf/pull/9201)) [@rongou](https://github.com/rongou) +- Add shallow hash function and shallow equality comparison for column_view ([#9185](https://github.com/rapidsai/cudf/pull/9185)) [@karthikeyann](https://github.com/karthikeyann) +- Superimpose null masks for STRUCT columns. 
([#9144](https://github.com/rapidsai/cudf/pull/9144)) [@mythrocks](https://github.com/mythrocks) +- Implemented bindings for `ceil` timestamp operation ([#9141](https://github.com/rapidsai/cudf/pull/9141)) [@shaneding](https://github.com/shaneding) +- Adding MAP type support for ORC Reader ([#9132](https://github.com/rapidsai/cudf/pull/9132)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Implement `interleave_columns` for lists with arbitrary nested type ([#9130](https://github.com/rapidsai/cudf/pull/9130)) [@ttnghia](https://github.com/ttnghia) +- Add python bindings to fixed-size window and groupby `rolling.var`, `rolling.std` ([#9097](https://github.com/rapidsai/cudf/pull/9097)) [@isVoid](https://github.com/isVoid) +- Make AST operators nullable ([#9096](https://github.com/rapidsai/cudf/pull/9096)) [@vyasr](https://github.com/vyasr) +- Java bindings for approx_percentile ([#9094](https://github.com/rapidsai/cudf/pull/9094)) [@andygrove](https://github.com/andygrove) +- Add `dseries.struct.explode` ([#9086](https://github.com/rapidsai/cudf/pull/9086)) [@isVoid](https://github.com/isVoid) +- Add support for BaseIndexer in Rolling APIs ([#9085](https://github.com/rapidsai/cudf/pull/9085)) [@galipremsagar](https://github.com/galipremsagar) +- Remove the option to pass data types as strings to `read_csv` and `read_json` ([#9079](https://github.com/rapidsai/cudf/pull/9079)) [@vuule](https://github.com/vuule) +- Add handling for nested dicts in dask-cudf groupby ([#9054](https://github.com/rapidsai/cudf/pull/9054)) [@charlesbluca](https://github.com/charlesbluca) +- Added Series.dt.is_quarter_start and Series.dt.is_quarter_end ([#9046](https://github.com/rapidsai/cudf/pull/9046)) [@TravisHester](https://github.com/TravisHester) +- Support nested types for nth_element reduction ([#9043](https://github.com/rapidsai/cudf/pull/9043)) [@sperlingxx](https://github.com/sperlingxx) +- Update sort groupby to use non-atomic operation ([#9035](https://github.com/rapidsai/cudf/pull/9035)) [@karthikeyann](https://github.com/karthikeyann) +- Add support for struct type in ORC writer ([#9025](https://github.com/rapidsai/cudf/pull/9025)) [@vuule](https://github.com/vuule) +- Implement `interleave_columns` for structs columns ([#9012](https://github.com/rapidsai/cudf/pull/9012)) [@ttnghia](https://github.com/ttnghia) +- Add groupby first and last aggregations ([#9004](https://github.com/rapidsai/cudf/pull/9004)) [@shwina](https://github.com/shwina) +- Add `DecimalBaseColumn` and move `as_decimal_column` ([#9001](https://github.com/rapidsai/cudf/pull/9001)) [@isVoid](https://github.com/isVoid) +- Python/Cython bindings for multibyte_split ([#8998](https://github.com/rapidsai/cudf/pull/8998)) [@jdye64](https://github.com/jdye64) +- Support scalar `months` in `add_calendrical_months`, extends API to INT32 support ([#8991](https://github.com/rapidsai/cudf/pull/8991)) [@isVoid](https://github.com/isVoid) +- Added Series.dt.is_month_end ([#8989](https://github.com/rapidsai/cudf/pull/8989)) [@TravisHester](https://github.com/TravisHester) +- Support for using tdigests to compute approximate percentiles. 
([#8983](https://github.com/rapidsai/cudf/pull/8983)) [@nvdbaranec](https://github.com/nvdbaranec) +- Support "unflatten" of columns flattened via `flatten_nested_columns()`: ([#8956](https://github.com/rapidsai/cudf/pull/8956)) [@mythrocks](https://github.com/mythrocks) +- Implement timestamp ceil ([#8942](https://github.com/rapidsai/cudf/pull/8942)) [@shaneding](https://github.com/shaneding) +- Add nested column selection to parquet reader ([#8933](https://github.com/rapidsai/cudf/pull/8933)) [@devavret](https://github.com/devavret) +- Expose conditional join size calculation ([#8928](https://github.com/rapidsai/cudf/pull/8928)) [@vyasr](https://github.com/vyasr) +- Support Nulls in Timeseries Generator ([#8925](https://github.com/rapidsai/cudf/pull/8925)) [@isVoid](https://github.com/isVoid) +- Avoid index equality check in `_CPackedColumns.from_py_table()` ([#8917](https://github.com/rapidsai/cudf/pull/8917)) [@charlesbluca](https://github.com/charlesbluca) +- Add dot product binary op ([#8909](https://github.com/rapidsai/cudf/pull/8909)) [@charlesbluca](https://github.com/charlesbluca) +- Expose `days_in_month` function in libcudf and add python bindings ([#8892](https://github.com/rapidsai/cudf/pull/8892)) [@isVoid](https://github.com/isVoid) +- Series string repeat ([#8882](https://github.com/rapidsai/cudf/pull/8882)) [@sarahyurick](https://github.com/sarahyurick) +- Python binding for quarters ([#8862](https://github.com/rapidsai/cudf/pull/8862)) [@shaneding](https://github.com/shaneding) +- Expand CSV and JSON reader APIs to accept `dtypes` as a vector or map of `data_type` objects ([#8856](https://github.com/rapidsai/cudf/pull/8856)) [@vuule](https://github.com/vuule) +- Add Java bindings for AST transform ([#8846](https://github.com/rapidsai/cudf/pull/8846)) [@jlowe](https://github.com/jlowe) +- Series datetime is_month_start ([#8844](https://github.com/rapidsai/cudf/pull/8844)) [@sarahyurick](https://github.com/sarahyurick) +- Support bracket syntax for cudf::strings::replace_with_backrefs group index values ([#8841](https://github.com/rapidsai/cudf/pull/8841)) [@davidwendt](https://github.com/davidwendt) +- Support `VARIANCE` and `STD` aggregation in rolling op ([#8809](https://github.com/rapidsai/cudf/pull/8809)) [@isVoid](https://github.com/isVoid) +- Add quarters to libcudf datetime ([#8779](https://github.com/rapidsai/cudf/pull/8779)) [@shaneding](https://github.com/shaneding) +- Linear Interpolation of `nan`s via `cupy` ([#8767](https://github.com/rapidsai/cudf/pull/8767)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Enable compiled binary ops in libcudf, python and java ([#8741](https://github.com/rapidsai/cudf/pull/8741)) [@karthikeyann](https://github.com/karthikeyann) +- Make groupby transform-like op order match original data order ([#8720](https://github.com/rapidsai/cudf/pull/8720)) [@isVoid](https://github.com/isVoid) +- multibyte_split ([#8702](https://github.com/rapidsai/cudf/pull/8702)) [@cwharris](https://github.com/cwharris) +- Implement JNI for `strings:repeat_strings` that repeats each string separately by different numbers of times ([#8572](https://github.com/rapidsai/cudf/pull/8572)) [@ttnghia](https://github.com/ttnghia) + +## 🛠️ Improvements + +- Pin max `dask` and `distributed` versions to `2021.09.1` ([#9286](https://github.com/rapidsai/cudf/pull/9286)) 
[@galipremsagar](https://github.com/galipremsagar) +- Optimized fsspec data transfer for remote file-systems ([#9265](https://github.com/rapidsai/cudf/pull/9265)) [@rjzamora](https://github.com/rjzamora) +- Skip dask-cudf tests on arm64 ([#9252](https://github.com/rapidsai/cudf/pull/9252)) [@Ethyling](https://github.com/Ethyling) +- Use nvcomp's snappy compressor in ORC writer ([#9242](https://github.com/rapidsai/cudf/pull/9242)) [@devavret](https://github.com/devavret) +- Only run imports tests on x86_64 ([#9241](https://github.com/rapidsai/cudf/pull/9241)) [@Ethyling](https://github.com/Ethyling) +- Remove unnecessary call to device_uvector::release() ([#9237](https://github.com/rapidsai/cudf/pull/9237)) [@harrism](https://github.com/harrism) +- Use nvcomp's snappy decompression in ORC reader ([#9235](https://github.com/rapidsai/cudf/pull/9235)) [@devavret](https://github.com/devavret) +- Add grouped_rolling test with STRUCT groupby keys. ([#9228](https://github.com/rapidsai/cudf/pull/9228)) [@mythrocks](https://github.com/mythrocks) +- Optimize `cudf.concat` for `axis=0` ([#9222](https://github.com/rapidsai/cudf/pull/9222)) [@galipremsagar](https://github.com/galipremsagar) +- Fix some libcudf calls not passing the stream parameter ([#9220](https://github.com/rapidsai/cudf/pull/9220)) [@davidwendt](https://github.com/davidwendt) +- Add min and max bounds for random dataframe generator numeric types ([#9211](https://github.com/rapidsai/cudf/pull/9211)) [@galipremsagar](https://github.com/galipremsagar) +- Improve performance of expression evaluation ([#9210](https://github.com/rapidsai/cudf/pull/9210)) [@vyasr](https://github.com/vyasr) +- Misc optimizations in `cudf` ([#9203](https://github.com/rapidsai/cudf/pull/9203)) [@galipremsagar](https://github.com/galipremsagar) +- Remove Cython APIs for table view generation ([#9199](https://github.com/rapidsai/cudf/pull/9199)) [@vyasr](https://github.com/vyasr) +- Add JNI support for drop_list_duplicates ([#9198](https://github.com/rapidsai/cudf/pull/9198)) [@revans2](https://github.com/revans2) +- Update pandas versions in conda recipes and requirements.txt files ([#9197](https://github.com/rapidsai/cudf/pull/9197)) [@galipremsagar](https://github.com/galipremsagar) +- Minor C++17 cleanup of `groupby.cu`: structured bindings, more concise lambda, etc ([#9193](https://github.com/rapidsai/cudf/pull/9193)) [@codereport](https://github.com/codereport) +- Explicit about bitwidth difference between cudf boolean and arrow boolean ([#9192](https://github.com/rapidsai/cudf/pull/9192)) [@isVoid](https://github.com/isVoid) +- Remove _source_index from MultiIndex ([#9191](https://github.com/rapidsai/cudf/pull/9191)) [@vyasr](https://github.com/vyasr) +- Fix typo in the name of `cudf-testing-targets.cmake` ([#9190](https://github.com/rapidsai/cudf/pull/9190)) [@trxcllnt](https://github.com/trxcllnt) +- Add support for single-digits in cudf::to_timestamps ([#9173](https://github.com/rapidsai/cudf/pull/9173)) [@davidwendt](https://github.com/davidwendt) +- Fix cufilejni build include path ([#9168](https://github.com/rapidsai/cudf/pull/9168)) [@pxLi](https://github.com/pxLi) +- `dask_cudf` dispatch registering cleanup ([#9160](https://github.com/rapidsai/cudf/pull/9160)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded stream/mr from a cudf::make_strings_column 
([#9148](https://github.com/rapidsai/cudf/pull/9148)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `pandas` version in `cudf` ([#9147](https://github.com/rapidsai/cudf/pull/9147)) [@galipremsagar](https://github.com/galipremsagar) +- make data chunk reader return unique_ptr ([#9129](https://github.com/rapidsai/cudf/pull/9129)) [@cwharris](https://github.com/cwharris) +- Add backend for `percentile_lookup` dispatch ([#9118](https://github.com/rapidsai/cudf/pull/9118)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor implementation of column setitem ([#9110](https://github.com/rapidsai/cudf/pull/9110)) [@vyasr](https://github.com/vyasr) +- Fix compile warnings found using nvcc 11.4 ([#9101](https://github.com/rapidsai/cudf/pull/9101)) [@davidwendt](https://github.com/davidwendt) +- Update to UCX-Py 0.22 ([#9099](https://github.com/rapidsai/cudf/pull/9099)) [@pentschev](https://github.com/pentschev) +- Simplify read_avro by removing unnecessary writer/impl classes ([#9090](https://github.com/rapidsai/cudf/pull/9090)) [@cwharris](https://github.com/cwharris) +- Allowing %f in format to return nanoseconds ([#9081](https://github.com/rapidsai/cudf/pull/9081)) [@marlenezw](https://github.com/marlenezw) +- Java bindings for cudf::hash_join ([#9080](https://github.com/rapidsai/cudf/pull/9080)) [@jlowe](https://github.com/jlowe) +- Remove stale code in `ColumnBase._fill` ([#9078](https://github.com/rapidsai/cudf/pull/9078)) [@isVoid](https://github.com/isVoid) +- Add support for `get_group` in GroupBy ([#9070](https://github.com/rapidsai/cudf/pull/9070)) [@galipremsagar](https://github.com/galipremsagar) +- Remove remaining "support" methods from DataFrame ([#9068](https://github.com/rapidsai/cudf/pull/9068)) [@vyasr](https://github.com/vyasr) +- Update JNI java CSV APIs to not use deprecated API ([#9066](https://github.com/rapidsai/cudf/pull/9066)) [@revans2](https://github.com/revans2) +- Added method to remove null_masks if the column has no nulls ([#9061](https://github.com/rapidsai/cudf/pull/9061)) [@razajafri](https://github.com/razajafri) +- Consolidate Several Series and Dataframe Methods ([#9059](https://github.com/rapidsai/cudf/pull/9059)) [@isVoid](https://github.com/isVoid) +- Remove usage of string based `set_dtypes` for `csv` & `json` readers ([#9049](https://github.com/rapidsai/cudf/pull/9049)) [@galipremsagar](https://github.com/galipremsagar) +- Remove some debug print statements from gtests ([#9048](https://github.com/rapidsai/cudf/pull/9048)) [@davidwendt](https://github.com/davidwendt) +- Support additional format specifiers in from_timestamps ([#9047](https://github.com/rapidsai/cudf/pull/9047)) [@davidwendt](https://github.com/davidwendt) +- Expose expression base class publicly and simplify public AST API ([#9045](https://github.com/rapidsai/cudf/pull/9045)) [@vyasr](https://github.com/vyasr) +- move filepath and mmap logic out of json/csv up to functions.cpp ([#9040](https://github.com/rapidsai/cudf/pull/9040)) [@cwharris](https://github.com/cwharris) +- Refactor Index hierarchy ([#9039](https://github.com/rapidsai/cudf/pull/9039)) [@vyasr](https://github.com/vyasr) +- cudf now leverages rapids-cmake to reduce CMake boilerplate ([#9030](https://github.com/rapidsai/cudf/pull/9030)) [@robertmaynard](https://github.com/robertmaynard) +- Add support for `STRUCT` input to `groupby` 
([#9024](https://github.com/rapidsai/cudf/pull/9024)) [@mythrocks](https://github.com/mythrocks) +- Refactor Frame scans ([#9021](https://github.com/rapidsai/cudf/pull/9021)) [@vyasr](https://github.com/vyasr) +- Remove duplicate `set_categories` code ([#9018](https://github.com/rapidsai/cudf/pull/9018)) [@isVoid](https://github.com/isVoid) +- Map support for ParquetWriter ([#9013](https://github.com/rapidsai/cudf/pull/9013)) [@razajafri](https://github.com/razajafri) +- Remove aliases of various api.types APIs from utils.dtypes. ([#9011](https://github.com/rapidsai/cudf/pull/9011)) [@vyasr](https://github.com/vyasr) +- Java bindings for conditional join output sizes ([#9002](https://github.com/rapidsai/cudf/pull/9002)) [@jlowe](https://github.com/jlowe) +- Remove _copy_construct factory ([#8999](https://github.com/rapidsai/cudf/pull/8999)) [@vyasr](https://github.com/vyasr) +- ENH Allow arbitrary CMake config options in build.sh ([#8996](https://github.com/rapidsai/cudf/pull/8996)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- A small optimization for JNI copy column view to column vector ([#8985](https://github.com/rapidsai/cudf/pull/8985)) [@revans2](https://github.com/revans2) +- Fix nvcc warnings in ORC writer ([#8975](https://github.com/rapidsai/cudf/pull/8975)) [@devavret](https://github.com/devavret) +- Support nested structs in rank and dense rank ([#8962](https://github.com/rapidsai/cudf/pull/8962)) [@rwlee](https://github.com/rwlee) +- Move compute_column API out of ast namespace ([#8957](https://github.com/rapidsai/cudf/pull/8957)) [@vyasr](https://github.com/vyasr) +- Series datetime is_year_end and is_year_start ([#8954](https://github.com/rapidsai/cudf/pull/8954)) [@marlenezw](https://github.com/marlenezw) +- Make Java AstNode public ([#8953](https://github.com/rapidsai/cudf/pull/8953)) [@jlowe](https://github.com/jlowe) +- Replace allocate with device_uvector for subword_tokenize internal tables ([#8952](https://github.com/rapidsai/cudf/pull/8952)) [@davidwendt](https://github.com/davidwendt) +- `cudf.dtype` function ([#8949](https://github.com/rapidsai/cudf/pull/8949)) [@shwina](https://github.com/shwina) +- Refactor Frame reductions ([#8944](https://github.com/rapidsai/cudf/pull/8944)) [@vyasr](https://github.com/vyasr) +- Add deprecation warning for `Series.set_mask` API ([#8943](https://github.com/rapidsai/cudf/pull/8943)) [@galipremsagar](https://github.com/galipremsagar) +- Move AST evaluator into a separate header ([#8930](https://github.com/rapidsai/cudf/pull/8930)) [@vyasr](https://github.com/vyasr) +- JNI Aggregation Type Changes ([#8919](https://github.com/rapidsai/cudf/pull/8919)) [@revans2](https://github.com/revans2) +- Move template parameter to function parameter in cudf::detail::left_semi_anti_join ([#8914](https://github.com/rapidsai/cudf/pull/8914)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `arrow` & `pyarrow` to `5.0.0` ([#8908](https://github.com/rapidsai/cudf/pull/8908)) [@galipremsagar](https://github.com/galipremsagar) +- Add groupby_aggregation and groupby_scan_aggregation classes and force their usage. ([#8906](https://github.com/rapidsai/cudf/pull/8906)) [@nvdbaranec](https://github.com/nvdbaranec) +- Move `structs_column_tests.cu` to `.cpp`. 
([#8902](https://github.com/rapidsai/cudf/pull/8902)) [@mythrocks](https://github.com/mythrocks) +- Add stream and memory-resource parameters to struct-scalar copy ctor ([#8901](https://github.com/rapidsai/cudf/pull/8901)) [@davidwendt](https://github.com/davidwendt) +- Combine linearizer and ast_plan ([#8900](https://github.com/rapidsai/cudf/pull/8900)) [@vyasr](https://github.com/vyasr) +- Add Java bindings for conditional join gather maps ([#8888](https://github.com/rapidsai/cudf/pull/8888)) [@jlowe](https://github.com/jlowe) +- Remove max version pin for `dask` & `distributed` on development branch ([#8881](https://github.com/rapidsai/cudf/pull/8881)) [@galipremsagar](https://github.com/galipremsagar) +- fix cufilejni build w/ c++17 ([#8877](https://github.com/rapidsai/cudf/pull/8877)) [@pxLi](https://github.com/pxLi) +- Add struct accessor to dask-cudf ([#8874](https://github.com/rapidsai/cudf/pull/8874)) [@NV-jpt](https://github.com/NV-jpt) +- Migrate dask-cudf CudfEngine to leverage ArrowDatasetEngine ([#8871](https://github.com/rapidsai/cudf/pull/8871)) [@rjzamora](https://github.com/rjzamora) +- Add JNI for extract_quarter, add_calendrical_months, and is_leap_year ([#8863](https://github.com/rapidsai/cudf/pull/8863)) [@revans2](https://github.com/revans2) +- Change cudf::scalar copy and move constructors to protected ([#8857](https://github.com/rapidsai/cudf/pull/8857)) [@davidwendt](https://github.com/davidwendt) +- Replace `is_same<>::value` with `is_same_v<>` ([#8852](https://github.com/rapidsai/cudf/pull/8852)) [@codereport](https://github.com/codereport) +- Add min `pytorch` version to `importorskip` in pytest ([#8851](https://github.com/rapidsai/cudf/pull/8851)) [@galipremsagar](https://github.com/galipremsagar) +- Java bindings for regex replace ([#8847](https://github.com/rapidsai/cudf/pull/8847)) [@jlowe](https://github.com/jlowe) +- Remove make strings children with null mask ([#8830](https://github.com/rapidsai/cudf/pull/8830)) [@davidwendt](https://github.com/davidwendt) +- Refactor conditional joins ([#8815](https://github.com/rapidsai/cudf/pull/8815)) [@vyasr](https://github.com/vyasr) +- Small cleanup (unused headers / commented code removals) ([#8799](https://github.com/rapidsai/cudf/pull/8799)) [@codereport](https://github.com/codereport) +- ENH Replace gpuci_conda_retry with gpuci_mamba_retry ([#8770](https://github.com/rapidsai/cudf/pull/8770)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Update cudf java bindings to 21.10.0-SNAPSHOT ([#8765](https://github.com/rapidsai/cudf/pull/8765)) [@pxLi](https://github.com/pxLi) +- Refactor and improve join benchmarks with nvbench ([#8734](https://github.com/rapidsai/cudf/pull/8734)) [@PointKernel](https://github.com/PointKernel) +- Refactor Python factories and remove usage of Table for libcudf output handling ([#8687](https://github.com/rapidsai/cudf/pull/8687)) [@vyasr](https://github.com/vyasr) +- Optimize URL Decoding ([#8622](https://github.com/rapidsai/cudf/pull/8622)) [@gaohao95](https://github.com/gaohao95) +- Parquet writer dictionary encoding refactor ([#8476](https://github.com/rapidsai/cudf/pull/8476)) [@devavret](https://github.com/devavret) +- Use nvcomp's snappy decompression in parquet reader ([#8252](https://github.com/rapidsai/cudf/pull/8252)) [@devavret](https://github.com/devavret) +- Use nvcomp's snappy compressor in parquet writer 
([#8229](https://github.com/rapidsai/cudf/pull/8229)) [@devavret](https://github.com/devavret) # cuDF 21.08.00 (4 Aug 2021) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f83d7c5b759..aae62fbd47c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -62,12 +62,12 @@ The following instructions are for developers and contributors to cuDF OSS devel Compilers: * `gcc` version 9.3+ -* `nvcc` version 11.0+ +* `nvcc` version 11.5+ * `cmake` version 3.20.1+ CUDA/GPU: -* CUDA 11.0+ +* CUDA 11.5+ * NVIDIA driver 450.80.02+ * Pascal architecture or better diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index bc82f638171..979db1b5034 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='main' +export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' function remove_libcudf_kernel_cache_dir { EXITCODE=$? diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index a9bc1f4c605..746c0005a47 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,15 +3,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. set -e -ARCH=$(arch) -if [ "${ARCH}" = "x86_64" ]; then - DEFAULT_CUDA_VER="11.0" -elif [ "${ARCH}" = "aarch64" ]; then - DEFAULT_CUDA_VER="11.2" -else - echo "Unsupported arch ${ARCH}" - exit 1 -fi +DEFAULT_CUDA_VER="11.5" #Always upload cudf Python package export UPLOAD_CUDF=1 diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e2e95c34650..8f83c169330 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. ############################################## # cuDF GPU build and test script for CI # ############################################## @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='main' +export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' ################################################################################ # TRAP - Setup trap for removing jitify cache @@ -176,6 +176,28 @@ else ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done + ################################################################################ + # MEMCHECK - Run compute-sanitizer on GoogleTest (only in nightly builds) + ################################################################################ + if [[ "$BUILD_MODE" == "branch" && "$BUILD_TYPE" == "gpu" ]]; then + if [[ "$COMPUTE_SANITIZER_ENABLE" == "true" ]]; then + gpuci_logger "Memcheck on GoogleTests with rmm_mode=cuda" + export GTEST_CUDF_RMM_MODE=cuda + COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck" + mkdir -p "$WORKSPACE/test-results/" + for gt in gtests/*; do + test_name=$(basename ${gt}) + if [[ "$test_name" == "ERROR_TEST" ]]; then + continue + fi + echo "Running GoogleTest $test_name" + ${COMPUTE_SANITIZER_CMD} ${gt} | tee "$WORKSPACE/test-results/${test_name}.cs.log" + done + unset GTEST_CUDF_RMM_MODE + # test-results/*.cs.log are processed in gpuci + fi + fi + CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install diff --git a/ci/local/README.md b/ci/local/README.md index 
96002802263..7754bcaf647 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -18,12 +18,12 @@ Build and test your local repository using a base gpuCI Docker image where: -H Show this help text -r Path to repository (defaults to working directory) - -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7) + -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8) -s Skip building and testing and start an interactive shell in a container of the Docker image ``` Example Usage: -`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7` +`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:22.02-cuda11.5-devel-ubuntu20.04-py3.8` For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page. @@ -42,7 +42,7 @@ There are some caveats to be aware of when using this script, especially if you ### Docker Image Build Repository -The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda10.1-ubuntu16.04-py3.7/`. Feel free to remove this directory after the script is finished. +The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda11.5-ubuntu20.04-py3.8/`. Feel free to remove this directory after the script is finished. *Note*: The script *will not* override your local build repository. Your local environment stays in tact. diff --git a/ci/local/build.sh b/ci/local/build.sh index 1bfb8b63fef..345db967264 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -3,7 +3,7 @@ GIT_DESCRIBE_TAG=`git describe --tags` MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` -DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.0-devel-ubuntu18.04-py3.7" +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cpp/build" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index eeb76a15fcc..86432a92128 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -30,13 +30,13 @@ function sed_runner() { } # cpp update -sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt # cpp libcudf_kafka update -sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt # cpp cudf_jni update -sed_runner 's/'"CUDF_JNI VERSION .* LANGUAGES"'/'"CUDF_JNI VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' java/src/main/native/CMakeLists.txt +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt # rapids-cmake version sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 60a5959a23f..e7b92eddd9e 100644 --- 
a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -7,14 +7,14 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=11.0.0 - - clang-tools=11.0.0 - - cupy>7.1.0,<10.0.0a0 + - clang=11.1.0 + - clang-tools=11.1.0 + - cupy>=9.5.0,<10.0.0a0 - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.53.1 + - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - pyarrow=5.0.0=*cuda @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 @@ -66,3 +66,4 @@ dependencies: - git+https://github.com/dask/distributed.git@main - git+https://github.com/python-streamz/streamz.git@master - pyorc + - ptxcompiler # [linux64] diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 7904593c4c7..6fe8ed0fafe 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -7,14 +7,14 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=11.0.0 - - clang-tools=11.0.0 - - cupy>7.1.0,<10.0.0a0 + - clang=11.1.0 + - clang-tools=11.1.0 + - cupy>=9.5.0,<10.0.0a0 - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.53.1 + - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - pyarrow=5.0.0=*cuda @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 @@ -66,3 +66,4 @@ dependencies: - git+https://github.com/dask/distributed.git@main - git+https://github.com/python-streamz/streamz.git@master - pyorc + - ptxcompiler # [linux64] diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 63800fe786b..cc8d50a1717 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -7,14 +7,14 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=11.0.0 - - clang-tools=11.0.0 - - cupy>7.1.0,<10.0.0a0 - - rmm=21.12.* + - clang=11.1.0 + - clang-tools=11.1.0 + - cupy>=9.5.0,<10.0.0a0 + - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.53.1 + - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - pyarrow=5.0.0=*cuda @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 @@ -66,3 +66,4 @@ dependencies: - git+https://github.com/dask/distributed.git@main - git+https://github.com/python-streamz/streamz.git@master - pyorc + - ptxcompiler # [linux64] diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 615135a6f6b..46eefbc825f 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -29,7 +29,7 @@ requirements: - python - cython >=0.29,<0.30 - setuptools - - numba >=0.53.1 + - numba >=0.54 - dlpack>=0.5,<0.6.0a0 - pyarrow 5.0.0 *cuda - libcudf {{ version }} @@ -40,8 +40,8 @@ requirements: - python - typing_extensions - pandas >=1.0,<1.4.0dev0 - - cupy >7.1.0,<10.0.0a0 - - numba >=0.53.1 + - cupy >=9.5.0,<10.0.0a0 + - numba >=0.54 - numpy - {{ 
pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda - fastavro >=0.22.0 @@ -51,6 +51,7 @@ requirements: - nvtx >=0.2.1 - packaging - cachetools + - ptxcompiler # [linux64] # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler test: # [linux64] requires: # [linux64] diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index dc3a17f03ab..db8af9b0bed 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -31,8 +31,8 @@ requirements: - python - streamz - cudf {{ version }} - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - python-confluent-kafka - cudf_kafka {{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 5631e262b87..d90de2d628c 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -4,6 +4,7 @@ {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} {% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_major=cuda_version.split('.')[0] %} package: name: dask-cudf @@ -14,7 +15,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda_{{ cuda_version }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - VERSION_SUFFIX - PARALLEL_LEVEL @@ -26,15 +27,15 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.09.1 - - distributed>=2021.09.1 + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 - cudatoolkit {{ cuda_version }} run: - python - cudf {{ version }} - - dask>=2021.09.1 - - distributed>=2021.09.1 - - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} + - dask>=2021.11.1,<=2021.11.2 + - distributed>=2021.11.1,<=2021.11.2 + - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] requires: # [linux64] diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d39c7aaa39d..e78110f3233 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -117,6 +117,7 @@ test: - test -f $PREFIX/include/cudf/dictionary/update_keys.hpp - test -f $PREFIX/include/cudf/filling.hpp - test -f $PREFIX/include/cudf/fixed_point/fixed_point.hpp + - test -f $PREFIX/include/cudf/fixed_point/temporary.hpp - test -f $PREFIX/include/cudf/groupby.hpp - test -f $PREFIX/include/cudf/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 966728d7647..86ec24c1b7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 21.12.00 + VERSION 22.02.00 LANGUAGES C CXX CUDA ) @@ -185,6 +185,7 @@ add_library( src/binaryop/compiled/LogicalOr.cu src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu + src/binaryop/compiled/NullEquals.cu src/binaryop/compiled/NullMax.cu src/binaryop/compiled/NullMin.cu src/binaryop/compiled/PMod.cu @@ -310,6 +311,7 @@ add_library( src/io/statistics/parquet_column_statistics.cu src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp + src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp @@ -733,6 +735,27 @@ set(install_code_string [=[ set(ArrowCUDA_DIR "${Arrow_DIR}") 
find_dependency(ArrowCUDA) +]=] +) + +if(CUDF_ENABLE_ARROW_PARQUET) + string( + APPEND + install_code_string + [=[ + if(NOT Parquet_DIR) + set(Parquet_DIR "${Arrow_DIR}") + endif() + set(ArrowDataset_DIR "${Arrow_DIR}") + find_dependency(ArrowDataset) + ]=] + ) +endif() + +string( + APPEND + install_code_string + [=[ if(testing IN_LIST cudf_FIND_COMPONENTS) enable_language(CUDA) if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index fa1e61e26fd..72b247ae748 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -208,7 +208,6 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # * binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench( BINARYOP_BENCH binaryop/binaryop_benchmark.cpp binaryop/compiled_binaryop_benchmark.cpp - binaryop/jit_binaryop_benchmark.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp index 9de1112a9db..314d657679b 100644 --- a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp @@ -74,14 +74,14 @@ static void BM_binaryop_transform(benchmark::State& state) auto const op = cudf::binary_operator::ADD; auto result_data_type = cudf::data_type(cudf::type_to_id()); if (reuse_columns) { - auto result = cudf::jit::binary_operation(columns.at(0), columns.at(0), op, result_data_type); + auto result = cudf::binary_operation(columns.at(0), columns.at(0), op, result_data_type); for (cudf::size_type i = 0; i < tree_levels - 1; i++) { - result = cudf::jit::binary_operation(result->view(), columns.at(0), op, result_data_type); + result = cudf::binary_operation(result->view(), columns.at(0), op, result_data_type); } } else { - auto result = cudf::jit::binary_operation(columns.at(0), columns.at(1), op, result_data_type); + auto result = cudf::binary_operation(columns.at(0), columns.at(1), op, result_data_type); std::for_each(std::next(columns.cbegin(), 2), columns.cend(), [&](auto const& col) { - result = cudf::jit::binary_operation(result->view(), col, op, result_data_type); + result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } } diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp index bc0818ace4b..8d04f8bdcb2 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -50,14 +50,14 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. 
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ +#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \ BENCHMARK_TEMPLATE_DEFINE_F( \ - COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ (::benchmark::State & st) \ { \ BM_compiled_binaryop(st, cudf::binary_operator::binop); \ } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop) \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ ->Unit(benchmark::kMicrosecond) \ ->UseManualTime() \ ->Arg(10000) /* 10k */ \ @@ -70,30 +70,36 @@ using namespace cudf; using namespace numeric; // clang-format off -BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); -BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); -BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); -BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float); +BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s); +BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s); +BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms); +BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms); +BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t); +BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(LOG_BASE, 
float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool); +BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp deleted file mode 100644 index 7fda4a50ea1..00000000000 --- a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include - -#include - -#include - -template -class JIT_BINARYOP : public cudf::benchmark { -}; - -template -void BM_binaryop(benchmark::State& state, cudf::binary_operator binop) -{ - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; - - auto data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper input1(data_it, data_it + column_size); - cudf::test::fixed_width_column_wrapper input2(data_it, data_it + column_size); - - auto lhs = cudf::column_view(input1); - auto rhs = cudf::column_view(input2); - auto output_dtype = cudf::data_type(cudf::type_to_id()); - - // Call once for hot cache. - cudf::jit::binary_operation(lhs, rhs, binop, output_dtype); - - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::jit::binary_operation(lhs, rhs, binop, output_dtype); - } -} - -// TODO tparam boolean for null. 
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - BENCHMARK_TEMPLATE_DEFINE_F( \ - JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ - (::benchmark::State & st) \ - { \ - BM_binaryop(st, cudf::binary_operator::binop); \ - } \ - BENCHMARK_REGISTER_F(JIT_BINARYOP, binop) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ - -using namespace cudf; -using namespace numeric; - -// clang-format off -BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); -BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); -BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); -BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); -BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); -BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); -BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); -BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index c2ad25760b8..6ab1293ab6f 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -51,8 +51,8 @@ function(jit_preprocess_files) endfunction() jit_preprocess_files( - SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu - transform/jit/masked_udf_kernel.cu transform/jit/kernel.cu rolling/jit/kernel.cu + SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu transform/jit/kernel.cu + rolling/jit/kernel.cu ) add_custom_target( diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 5fe37402096..ae1448da502 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -90,7 +90,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared arrow_cuda_shared + GLOBAL_TARGETS arrow_shared parquet_shared arrow_cuda_shared arrow_dataset_shared CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG 
apache-arrow-${VERSION} @@ -142,6 +142,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ArrowCUDA_DIR "${Arrow_DIR}") find_package(Arrow REQUIRED QUIET) find_package(ArrowCUDA REQUIRED QUIET) + if(ENABLE_PARQUET) + if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") + endif() + # Set this to enable `find_package(ArrowDataset)` + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to # target_include_directories. That defeats ccache. @@ -182,24 +191,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() if(Arrow_ADDED) + set(arrow_code_string [=[ - if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) - add_library(arrow_shared ALIAS cudf::arrow_shared) - endif() - if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) - add_library(arrow_static ALIAS cudf::arrow_static) - endif() - ]=] - ) - set(arrow_cuda_code_string - [=[ - if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) - add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) - endif() - if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static)) - add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) - endif() + if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) + add_library(arrow_shared ALIAS cudf::arrow_shared) + endif() + if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) + add_library(arrow_static ALIAS cudf::arrow_static) + endif() ]=] ) @@ -212,6 +212,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB FINAL_CODE_BLOCK arrow_code_string ) + set(arrow_cuda_code_string + [=[ + if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) + add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) + endif() + if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static)) + add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) + endif() + ]=] + ) + rapids_export( BUILD ArrowCUDA VERSION ${VERSION} @@ -220,6 +231,49 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_cuda_code_string ) + + if(ENABLE_PARQUET) + + set(arrow_dataset_code_string + [=[ + if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) + add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) + endif() + if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) + add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowDataset + VERSION ${VERSION} + EXPORT_SET arrow_dataset_targets + GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_dataset_code_string + ) + + set(parquet_code_string + [=[ + if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) + add_library(parquet_shared ALIAS cudf::parquet_shared) + endif() + if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) + add_library(parquet_static ALIAS cudf::parquet_static) + endif() + ]=] + ) + + rapids_export( + BUILD Parquet + VERSION ${VERSION} + EXPORT_SET parquet_targets + GLOBAL_TARGETS parquet_shared parquet_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK parquet_code_string + ) + endif() endif() # We generate the arrow-config and arrowcuda-config files when we built 
arrow locally, so always # do `find_dependency` @@ -230,10 +284,18 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # ArrowCUDA_DIR to be where Arrow was found, since Arrow packages ArrowCUDA.config in a # non-standard location rapids_export_package(BUILD ArrowCUDA cudf-exports) + if(ENABLE_PARQUET) + rapids_export_package(BUILD Parquet cudf-exports) + rapids_export_package(BUILD ArrowDataset cudf-exports) + endif() include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) rapids_export_find_package_root(BUILD ArrowCUDA [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + if(ENABLE_PARQUET) + rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + endif() set(ARROW_FOUND "${ARROW_FOUND}" diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 55e5119040e..6a556bb4b34 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -2089,7 +2089,7 @@ ENABLE_PREPROCESSING = YES # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -2097,7 +2097,7 @@ MACRO_EXPANSION = NO # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_ONLY_PREDEF = NO +EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES, the include files in the # INCLUDE_PATH will be searched if a #include is found. @@ -2129,7 +2129,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = __device__= \ + __host__= # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index dc800bde527..4f73cb1ef6e 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -37,9 +37,9 @@ namespace detail { * linearization process but cannot be explicitly created by the user. */ enum class device_data_reference_type { - COLUMN, // A value in a table column - LITERAL, // A literal value - INTERMEDIATE // An internal temporary value + COLUMN, ///< A value in a table column + LITERAL, ///< A literal value + INTERMEDIATE ///< An internal temporary value }; /** diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 7ae40a7d65f..20aaa42fb68 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -122,9 +122,9 @@ enum class ast_operator { * This determines which table to use in cases with two tables (e.g. joins). 
*/ enum class table_reference { - LEFT, // Column index in the left table - RIGHT, // Column index in the right table - OUTPUT // Column index in the output table + LEFT, ///< Column index in the left table + RIGHT, ///< Column index in the right table + OUTPUT ///< Column index in the output table }; /** diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index fe548a36cf0..a514010c1f0 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -210,83 +210,5 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, cudf::data_type const& lhs, cudf::data_type const& rhs); -namespace jit { -/** - * @brief Performs a binary operation between a scalar and a column. - * - * The output contains the result of `op(lhs, rhs[i])` for all `0 <= i < rhs.size()` - * The scalar is the left operand and the column elements are the right operand. - * This distinction is significant in case of non-commutative binary operations - * - * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands - * - * @param lhs The left operand scalar - * @param rhs The right operand column - * @param op The binary operator - * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Output column of `output_type` type containing the result of - * the binary operation - * @throw cudf::logic_error if @p output_type dtype isn't fixed-width - */ -std::unique_ptr binary_operation( - scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Performs a binary operation between a column and a scalar. - * - * The output contains the result of `op(lhs[i], rhs)` for all `0 <= i < lhs.size()` - * The column elements are the left operand and the scalar is the right operand. - * This distinction is significant in case of non-commutative binary operations - * - * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands - * - * @param lhs The left operand column - * @param rhs The right operand scalar - * @param op The binary operator - * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Output column of `output_type` type containing the result of - * the binary operation - * @throw cudf::logic_error if @p output_type dtype isn't fixed-width - */ -std::unique_ptr binary_operation( - column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Performs a binary operation between two columns. 
- * - * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()` - * - * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands - * - * @param lhs The left operand column - * @param rhs The right operand column - * @param op The binary operator - * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Output column of `output_type` type containing the result of - * the binary operation - * @throw cudf::logic_error if @p lhs and @p rhs are different sizes - * @throw cudf::logic_error if @p output_type dtype isn't fixed-width - */ -std::unique_ptr binary_operation( - column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace jit /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 0b7ca2096a3..6ecb0796283 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -421,39 +421,22 @@ class alignas(16) column_device_view : public detail::column_device_view_base { } /** - * @brief Returns a `numeric::decimal32` element at the specified index for a `fixed_point` + * @brief Returns a `numeric::fixed_point` element at the specified index for a `fixed_point` * column. * * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, * then any attempt to use the result will lead to undefined behavior. * * @param element_index Position of the desired element - * @return numeric::decimal32 representing the element at this index + * @return numeric::fixed_point representing the element at this index */ - template )> + template ())> __device__ T element(size_type element_index) const noexcept { using namespace numeric; + using rep = typename T::rep; auto const scale = scale_type{_type.scale()}; - return decimal32{scaled_integer{data()[element_index], scale}}; - } - - /** - * @brief Returns a `numeric::decimal64` element at the specified index for a `fixed_point` - * column. - * - * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, - * then any attempt to use the result will lead to undefined behavior. 
- * - * @param element_index Position of the desired element - * @return numeric::decimal64 representing the element at this index - */ - template )> - __device__ T element(size_type element_index) const noexcept - { - using namespace numeric; - auto const scale = scale_type{_type.scale()}; - return decimal64{scaled_integer{data()[element_index], scale}}; + return T{scaled_integer{data()[element_index], scale}}; } /** diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index ba5043fb261..81dddbd284a 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -41,8 +41,8 @@ namespace cudf { */ enum class out_of_bounds_policy : bool { - NULLIFY, /// Output values corresponding to out-of-bounds indices are null - DONT_CHECK /// No bounds checking is performed, better performance + NULLIFY, ///< Output values corresponding to out-of-bounds indices are null + DONT_CHECK ///< No bounds checking is performed, better performance }; /** @@ -901,8 +901,8 @@ std::unique_ptr get_element( * @brief Indicates whether a row can be sampled more than once. */ enum class sample_with_replacement : bool { - FALSE, // A row can be sampled only once - TRUE // A row can be sampled more than once + FALSE, ///< A row can be sampled only once + TRUE ///< A row can be sampled more than once }; /** diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index d67984daa7c..71e5968bf07 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -378,5 +378,96 @@ std::unique_ptr ceil_nanosecond( column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Round down to the nearest day + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_day( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest hour + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_hour( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest minute + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_minute( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest second + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_second( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest millisecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_millisecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest microsecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest nanosecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. 
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace datetime } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 53c1f47c201..47aa7d18489 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -131,7 +132,8 @@ struct update_target_element< aggregation::MIN, target_has_nulls, source_has_nulls, - std::enable_if_t() && !is_fixed_point()>> { + std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -148,11 +150,13 @@ struct update_target_element< }; template -struct update_target_element()>> { +struct update_target_element< + Source, + aggregation::MIN, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -177,7 +181,8 @@ struct update_target_element< aggregation::MAX, target_has_nulls, source_has_nulls, - std::enable_if_t() && !is_fixed_point()>> { + std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -194,11 +199,13 @@ struct update_target_element< }; template -struct update_target_element()>> { +struct update_target_element< + Source, + aggregation::MAX, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -223,7 +230,8 @@ struct update_target_element< aggregation::SUM, target_has_nulls, source_has_nulls, - std::enable_if_t() && !is_fixed_point()>> { + std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -240,11 +248,13 @@ struct update_target_element< }; template -struct update_target_element()>> { +struct update_target_element< + Source, + aggregation::SUM, + target_has_nulls, + source_has_nulls, + std::enable_if_t() && + cudf::has_atomic_support>()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -267,7 +277,8 @@ struct update_target_element @@ -581,9 +592,7 @@ struct identity_initializer { template static constexpr bool is_supported() { - // Note: !is_fixed_point() means that aggregations for fixed_point should happen on the - // underlying type (see device_storage_type_t), not that fixed_point is not supported - return cudf::is_fixed_width() && !is_fixed_point() and + return cudf::is_fixed_width() and (k == aggregation::SUM or k == aggregation::MIN or k == aggregation::MAX or k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or k == aggregation::ARGMAX or k == aggregation::ARGMIN or @@ -596,7 +605,8 @@ struct identity_initializer { std::enable_if_t, void>::value, T> 
identity_from_operator() { - return corresponding_operator_t::template identity(); + using DeviceType = device_storage_type_t; + return corresponding_operator_t::template identity(); } template @@ -613,9 +623,11 @@ struct identity_initializer { if constexpr (cudf::is_timestamp()) return k == aggregation::ARGMAX ? T{typename T::duration(ARGMAX_SENTINEL)} : T{typename T::duration(ARGMIN_SENTINEL)}; - else - return k == aggregation::ARGMAX ? static_cast(ARGMAX_SENTINEL) - : static_cast(ARGMIN_SENTINEL); + else { + using DeviceType = device_storage_type_t; + return k == aggregation::ARGMAX ? static_cast(ARGMAX_SENTINEL) + : static_cast(ARGMIN_SENTINEL); + } } return identity_from_operator(); } @@ -625,7 +637,11 @@ struct identity_initializer { std::enable_if_t(), void> operator()(mutable_column_view const& col, rmm::cuda_stream_view stream) { - thrust::fill(rmm::exec_policy(stream), col.begin(), col.end(), get_identity()); + using DeviceType = device_storage_type_t; + thrust::fill(rmm::exec_policy(stream), + col.begin(), + col.end(), + get_identity()); } template diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 69bde7f57fd..c2bd7a4893c 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1085,8 +1085,8 @@ template struct target_type_impl< Source, k, - std::enable_if_t() && not is_chrono() && - not is_fixed_point() && (k == aggregation::MEAN)>> { + std::enable_if_t() and not is_chrono() and + not is_fixed_point() and (k == aggregation::MEAN)>> { using type = double; }; @@ -1113,12 +1113,13 @@ struct target_type_impl< using type = int64_t; }; -// Summing fixed_point numbers, always use the decimal64 accumulator +// Summing fixed_point numbers template -struct target_type_impl() && (k == aggregation::SUM)>> { - using type = numeric::decimal64; +struct target_type_impl< + Source, + k, + std::enable_if_t() && (k == aggregation::SUM)>> { + using type = Source; }; // Summing/Multiplying float/doubles, use same type accumulator diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index ce7731ef7d2..9fa31d0e01d 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -22,52 +22,9 @@ namespace cudf { //! Inner interfaces and implementations namespace detail { -namespace jit { -/** - * @copydoc cudf::jit::binary_operation(scalar const&, column_view const&, binary_operator, - * data_type, rmm::mr::device_memory_resource *) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr binary_operation( - scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::jit::binary_operation(column_view const&, scalar const&, binary_operator, - * data_type, rmm::mr::device_memory_resource *) - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ -std::unique_ptr binary_operation( - column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&, - * binary_operator, data_type, rmm::mr::device_memory_resource *) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr binary_operation( - column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace jit - -/** - * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&, + * @copydoc cudf::binary_operation(column_view const&, column_view const&, * std::string const&, data_type, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index bb5cfa5c6e0..fb4c636fcb0 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -217,12 +217,7 @@ struct DeviceType()>> { }; template -struct DeviceType>> { - using type = typename cudf::device_storage_type_t; -}; - -template -struct DeviceType>> { +struct DeviceType()>> { using type = typename cudf::device_storage_type_t; }; diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 594191d275d..08dbdb6f1a0 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -66,9 +66,9 @@ struct bounds_checker { * @brief The operation to perform when a gather map index is out of bounds */ enum class gather_bitmask_op { - DONT_CHECK, // Don't check for out of bounds indices - PASSTHROUGH, // Preserve mask at rows with out of bounds indices - NULLIFY, // Nullify rows with out of bounds indices + DONT_CHECK, ///< Don't check for out of bounds indices + PASSTHROUGH, ///< Preserve mask at rows with out of bounds indices + NULLIFY, ///< Nullify rows with out of bounds indices }; template diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index deb161fd9c2..3e789299716 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -102,9 +102,8 @@ struct null_replaced_value_accessor { bool has_nulls = true) : col{col}, null_replacement{null_val}, has_nulls{has_nulls} { - CUDF_EXPECTS(type_to_id() == device_storage_type_id(col.type().id()), + CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); - // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); } diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 113c15f19a1..8e3db1c7b10 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -26,22 +26,25 @@ namespace detail { /** * @brief Computes the exclusive scan of a column. * - * The null values are skipped for the operation, and if an input element - * at `i` is null, then the output element at `i` will also be null. 
+ * The null values are skipped for the operation, and if an input element at `i` is null, then the + * output element at `i` will also be null. * - * The identity value for the column type as per the aggregation type - * is used for the value of the first element in the output column. + * The identity value for the column type as per the aggregation type is used for the value of the + * first element in the output column. * - * @throws cudf::logic_error if column data_type is not an arithmetic type. + * Struct columns are allowed with aggregation types Min and Max. * - * @param input The input column view for the scan - * @param agg unique_ptr to aggregation operator applied by the scan - * @param null_handling Exclude null values when computing the result if - * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. - * Any operation with a null results in a null. + * @throws cudf::logic_error if column data_type is not an arithmetic type or struct type but the + * `agg` is not Min or Max. + * + * @param input The input column view for the scan. + * @param agg unique_ptr to aggregation operator applied by the scan. + * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. + * Include nulls if null_policy::INCLUDE. Any operation with a null results in + * a null. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned scalar's device memory - * @returns Column with scan results + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @returns Column with scan results. */ std::unique_ptr scan_exclusive(column_view const& input, std::unique_ptr const& agg, @@ -52,22 +55,22 @@ std::unique_ptr scan_exclusive(column_view const& input, /** * @brief Computes the inclusive scan of a column. * - * The null values are skipped for the operation, and if an input element - * at `i` is null, then the output element at `i` will also be null. + * The null values are skipped for the operation, and if an input element at `i` is null, then the + * output element at `i` will also be null. * - * String columns are allowed with aggregation types Min and Max. + * String and struct columns are allowed with aggregation types Min and Max. * - * @throws cudf::logic_error if column data_type is not an arithmetic type - * or string type but the `agg` is not Min or Max + * @throws cudf::logic_error if column data_type is not an arithmetic type or string/struct types + * but the `agg` is not Min or Max. * - * @param input The input column view for the scan - * @param agg unique_ptr to aggregation operator applied by the scan - * @param null_handling Exclude null values when computing the result if - * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. - * Any operation with a null results in a null. + * @param input The input column view for the scan. + * @param agg unique_ptr to aggregation operator applied by the scan. + * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. + * Include nulls if null_policy::INCLUDE. Any operation with a null results in + * a null. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned scalar's device memory - * @returns Column with scan results + * @param mr Device memory resource used to allocate the returned scalar's device memory. 
+ * @returns Column with scan results. */ std::unique_ptr scan_inclusive(column_view const& input, std::unique_ptr const& agg, @@ -76,24 +79,24 @@ std::unique_ptr scan_inclusive(column_view const& input, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row ranks for a column + * @brief Generate row ranks for a column. * - * @param order_by Input column to generate ranks for - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return rank values + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. */ std::unique_ptr inclusive_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row dense ranks for a column + * @brief Generate row dense ranks for a column. * - * @param order_by Input column to generate ranks for - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return rank values + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. */ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index aece79107c6..6f32e3190bf 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -28,8 +28,8 @@ namespace structs { namespace detail { enum class column_nullability { - MATCH_INCOMING, // generate a null column if the incoming column has nulls - FORCE // always generate a null column + MATCH_INCOMING, ///< generate a null column if the incoming column has nulls + FORCE ///< always generate a null column }; /** diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 7524593e5ea..95605dc8a71 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -22,6 +22,7 @@ */ #include +#include #include #include #include @@ -119,7 +120,7 @@ struct DeviceMin { CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::min(lhs, rhs)) { - return cudf::detail::min(lhs, rhs); + return numeric::detail::min(lhs, rhs); } template < @@ -128,14 +129,15 @@ struct DeviceMin { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { - return std::numeric_limits::max(); + if constexpr (cudf::is_chrono()) return T::max(); + return cuda::std::numeric_limits::max(); } template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); - return std::numeric_limits::max(); + return cuda::std::numeric_limits::max(); } // @brief identity specialized for string_view @@ -160,7 +162,7 @@ struct DeviceMax { CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::max(lhs, rhs)) { - return 
cudf::detail::max(lhs, rhs); + return numeric::detail::max(lhs, rhs); } template < @@ -169,14 +171,15 @@ struct DeviceMax { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { - return std::numeric_limits::lowest(); + if constexpr (cudf::is_chrono()) return T::min(); + return cuda::std::numeric_limits::lowest(); } template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); - return std::numeric_limits::lowest(); + return cuda::std::numeric_limits::lowest(); } template >* = nullptr> diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index ebb21492be9..f3390d9387b 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -274,6 +274,13 @@ MurmurHash3_32::operator()(numeric::decimal64 const& key) co return this->compute(key.value()); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(numeric::decimal128 const& key) const +{ + return this->compute(key.value()); +} + template <> hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(cudf::list_view const& key) const @@ -419,6 +426,13 @@ SparkMurmurHash3_32::operator()(numeric::decimal64 const& ke return this->compute(key.value()); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +SparkMurmurHash3_32::operator()(numeric::decimal128 const& key) const +{ + return this->compute<__int128_t>(key.value()); +} + template <> hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(cudf::list_view const& key) const diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index dc919433da7..ddedab3944c 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -22,6 +22,8 @@ * @file Utility code involving integer arithmetic */ +#include + #include #include #include @@ -151,17 +153,11 @@ constexpr inline bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -std::enable_if_t::value, T> constexpr inline absolute_value(T value) -{ - return std::abs(value); -} -// Unsigned type just returns itself. -template -std::enable_if_t::value, T> constexpr inline absolute_value(T value) +constexpr inline auto absolute_value(T value) -> T { + if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; } } // namespace util - } // namespace cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index af2715d1290..e8223b53997 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include // Note: The versions are used in order for Jitify to work with our fixed_point type. 
@@ -48,13 +49,15 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 }; template constexpr inline auto is_supported_representation_type() { - return cuda::std::is_same_v || cuda::std::is_same_v; + return cuda::std::is_same_v || // + cuda::std::is_same_v || // + cuda::std::is_same_v; } template constexpr inline auto is_supported_construction_value_type() { - return cuda::std::is_integral::value || cuda::std::is_floating_point::value; + return cuda::std::is_integral() || cuda::std::is_floating_point::value; } // Helper functions for `fixed_point` type @@ -551,17 +554,18 @@ class fixed_point { explicit operator std::string() const { if (_scale < 0) { - auto const av = std::abs(_value); - int64_t const n = std::pow(10, -_scale); - int64_t const f = av % n; + auto const av = detail::abs(_value); + Rep const n = detail::exp10(-_scale); + Rep const f = av % n; auto const num_zeros = - std::max(0, (-_scale - static_cast(std::to_string(f).size()))); + std::max(0, (-_scale - static_cast(detail::to_string(f).size()))); auto const zeros = std::string(num_zeros, '0'); auto const sign = _value < 0 ? std::string("-") : std::string(); - return sign + std::to_string(av / n) + std::string(".") + zeros + std::to_string(av % n); + return sign + detail::to_string(av / n) + std::string(".") + zeros + + detail::to_string(av % n); } else { auto const zeros = std::string(_scale, '0'); - return std::to_string(_value) + zeros; + return detail::to_string(_value) + zeros; } } }; @@ -752,8 +756,9 @@ CUDA_HOST_DEVICE_CALLABLE bool operator>(fixed_point const& lhs, return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; } -using decimal32 = fixed_point; -using decimal64 = fixed_point; +using decimal32 = fixed_point; +using decimal64 = fixed_point; +using decimal128 = fixed_point<__int128_t, Radix::BASE_10>; /** @} */ // end of group } // namespace numeric diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp new file mode 100644 index 00000000000..2b50e273517 --- /dev/null +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +// Note: The versions are used in order for Jitify to work with our fixed_point type. 
+// Jitify is needed for several algorithms (binaryop, rolling, etc) +#include +#include + +#include +#include + +namespace numeric { +namespace detail { + +template +auto to_string(T value) -> std::string +{ + if constexpr (cuda::std::is_same_v) { + auto s = std::string{}; + auto const sign = value < 0; + if (sign) { + value += 1; // avoid overflowing if value == __int128_t lowest + value *= -1; + if (value == cuda::std::numeric_limits<__int128_t>::max()) + return "-170141183460469231731687303715884105728"; + value += 1; // can add back the one, no need to avoid overflow anymore + } + while (value) { + s.push_back("0123456789"[value % 10]); + value /= 10; + } + if (sign) s.push_back('-'); + std::reverse(s.begin(), s.end()); + return s; + } else { + return std::to_string(value); + } + return std::string{}; // won't ever hit here, need to suppress warning though +} + +template +constexpr auto abs(T value) +{ + return value >= 0 ? value : -value; +} + +template +CUDA_HOST_DEVICE_CALLABLE auto min(T lhs, T rhs) +{ + return lhs < rhs ? lhs : rhs; +} + +template +CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs) +{ + return lhs > rhs ? lhs : rhs; +} + +template +constexpr auto exp10(int32_t exponent) +{ + BaseType value = 1; + while (exponent > 0) + value *= 10, --exponent; + return value; +} + +} // namespace detail +} // namespace numeric diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index aac44bed50e..c190340f6c1 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,45 +40,23 @@ table_with_metadata read_csv(std::unique_ptr&& source, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -class writer { - public: - class impl; - - private: - std::unique_ptr _impl; - - public: - /** - * @brief Constructor for output to a file. - * - * @param sinkp The data sink to write the data to - * @param options Settings for controlling writing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - writer(std::unique_ptr sinkp, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); // cannot provide definition here (because - // _impl is incomplete hence unique_ptr has - // not enough sizeof() info) - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~writer(); +/** + * @brief Write an entire dataset to CSV format. + * + * @param sink Output sink + * @param table The set of columns + * @param metadata The metadata associated with the table + * @param options Settings for controlling behavior + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void write_csv(data_sink* sink, + table_view const& table, + const table_metadata* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Writes the entire dataset.
- * - * @param table Set of columns to output - * @param metadata Table metadata and column names - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; } // namespace csv } // namespace detail } // namespace io diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 14f27ef8eef..98922ad10a4 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -148,7 +148,7 @@ class writer { * @param[in] metadata_list List of input file metadata * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ - static std::unique_ptr> merge_rowgroup_metadata( + static std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list); }; diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 2a95b85465b..3bc2e6c9ef2 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -70,6 +70,9 @@ class orc_reader_options { // Columns that should be converted from Decimal to Float64 std::vector _decimal_cols_as_float; + // Columns that should be read as Decimal128 + std::vector _decimal128_columns; + friend orc_reader_options_builder; /** @@ -136,13 +139,18 @@ class orc_reader_options { data_type get_timestamp_type() const { return _timestamp_type; } /** - * @brief Columns that should be converted from Decimal to Float64. + * @brief Fully qualified names of columns that should be converted from Decimal to Float64. */ std::vector const& get_decimal_cols_as_float() const { return _decimal_cols_as_float; } + /** + * @brief Fully qualified names of columns that should be read as 128-bit Decimal. + */ + std::vector const& get_decimal128_columns() const { return _decimal128_columns; } + // Setters /** @@ -210,12 +218,22 @@ class orc_reader_options { /** * @brief Set columns that should be converted from Decimal to Float64 * - * @param val Vector of column names. + * @param val Vector of fully qualified column names. */ void set_decimal_cols_as_float(std::vector val) { _decimal_cols_as_float = std::move(val); } + + /** + * @brief Set columns that should be read as 128-bit Decimal + * + * @param val Vector of fully qualified column names. + */ + void set_decimal128_columns(std::vector val) + { + _decimal128_columns = std::move(val); + } }; class orc_reader_options_builder { @@ -332,6 +350,18 @@ class orc_reader_options_builder { return *this; } + /** + * @brief Columns that should be read as 128-bit Decimal + * + * @param val Vector of column names. + * @return this for chaining. + */ + orc_reader_options_builder& decimal128_columns(std::vector val) + { + options._decimal128_columns = std::move(val); + return *this; + } + /** * @brief move orc_reader_options member once it's built. */ @@ -445,24 +475,24 @@ class orc_writer_options { /** * @brief Whether writing column statistics is enabled/disabled. */ - bool enable_statistics() const { return _enable_statistics; } + bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. */ - auto stripe_size_bytes() const { return _stripe_size_bytes; } + auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. 
*/ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } @@ -739,24 +769,24 @@ class chunked_orc_writer_options { /** * @brief Whether writing column statistics is enabled/disabled. */ - bool enable_statistics() const { return _enable_statistics; } + bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. */ - auto stripe_size_bytes() const { return _stripe_size_bytes; } + auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. */ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 660ec051304..88cf7416506 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -37,6 +37,9 @@ namespace io { * @file */ +constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB +constexpr size_type default_row_group_size_rows = 1000000; + /** * @brief Builds parquet_reader_options to use for `read_parquet()`. */ @@ -398,6 +401,10 @@ class parquet_writer_options { bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink and table. @@ -472,6 +479,16 @@ class parquet_writer_options { */ std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -510,6 +527,28 @@ class parquet_writer_options { { _column_chunks_file_path.assign(file_path); } + + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. 
+ */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } }; class parquet_writer_options_builder { @@ -582,6 +621,30 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * @@ -637,7 +700,7 @@ std::unique_ptr> write_parquet( * @param[in] metadata_list List of input file metadata. * @return A parquet-compatible blob that contains the data for all row groups in the list. */ -std::unique_ptr> merge_rowgroup_metadata( +std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list); /** @@ -660,6 +723,10 @@ class chunked_parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink. @@ -703,6 +770,16 @@ class chunked_parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -732,6 +809,28 @@ class chunked_parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. + */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -811,6 +910,30 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. 
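// Editor's aside (not part of the diff): a minimal end-to-end sketch of the new row-group size
// knobs, assuming an existing cudf::table_view `tbl` and a hypothetical output path. Values below
// the 512 KB / 5000-row floors would be rejected by the CUDF_EXPECTS checks above.
#include <cudf/io/parquet.hpp>

auto pq_opts = cudf::io::parquet_writer_options_builder(cudf::io::sink_info{"example.parquet"}, tbl)
                 .row_group_size_bytes(64 * 1024 * 1024)  // 64 MB row groups
                 .row_group_size_rows(500000)             // at most 500k rows per row group
                 .build();
cudf::io::write_parquet(pq_opts);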
+ */ + chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index ac965e2d416..cf6be8a20af 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -87,9 +87,9 @@ enum class quote_style { * @brief Column statistics granularity type for parquet/orc writers */ enum statistics_freq { - STATISTICS_NONE = 0, //!< No column statistics - STATISTICS_ROWGROUP = 1, //!< Per-Rowgroup column statistics - STATISTICS_PAGE = 2, //!< Per-page column statistics + STATISTICS_NONE = 0, ///< No column statistics + STATISTICS_ROWGROUP = 1, ///< Per-Rowgroup column statistics + STATISTICS_PAGE = 2, ///< Per-page column statistics }; /** diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index a9407ed57ca..61a81e8a745 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -26,7 +26,7 @@ namespace lists { * @file */ -/* +/** * @brief Flag to specify whether a null list element will be ignored from concatenation, or the * entire concatenation result involving null list elements will be a null element. */ diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index aa98c2e6404..dc2df368bae 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -359,7 +359,7 @@ class fixed_point_scalar : public scalar { rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** - * @brief Get the decimal32 or decimal64. + * @brief Get the decimal32, decimal64 or decimal128. * * @param stream CUDA stream used for device memory operations. 
*/ diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 884b412d3e2..56afa150dfc 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -91,6 +91,12 @@ class fixed_width_scalar_device_view_base : public detail::scalar_device_view_ba return *data(); } + /** + * @brief Stores the value in scalar + * + * @tparam T The desired type + * @param value The value to store in scalar + */ template __device__ void set_value(T value) { @@ -159,6 +165,11 @@ class fixed_width_scalar_device_view : public detail::fixed_width_scalar_device_ return fixed_width_scalar_device_view_base::value(); } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(T value) { fixed_width_scalar_device_view_base::set_value(value); } /** @@ -218,6 +229,11 @@ class fixed_point_scalar_device_view : public detail::scalar_device_view_base { { } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(rep_type value) { *_data = value; } /** diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 2af79de0716..04d65065bd3 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -37,16 +37,16 @@ namespace strings { * does not match to any explicitly named enumerator. */ enum string_character_types : uint32_t { - DECIMAL = 1 << 0, /// all decimal characters - NUMERIC = 1 << 1, /// all numeric characters - DIGIT = 1 << 2, /// all digit characters - ALPHA = 1 << 3, /// all alphabetic characters - SPACE = 1 << 4, /// all space characters - UPPER = 1 << 5, /// all upper case characters - LOWER = 1 << 6, /// all lower case characters - ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, /// all alphanumeric characters - CASE_TYPES = UPPER | LOWER, /// all case-able characters - ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE /// all character types + DECIMAL = 1 << 0, ///< all decimal characters + NUMERIC = 1 << 1, ///< all numeric characters + DIGIT = 1 << 2, ///< all digit characters + ALPHA = 1 << 3, ///< all alphabetic characters + SPACE = 1 << 4, ///< all space characters + UPPER = 1 << 5, ///< all upper case characters + LOWER = 1 << 6, ///< all lower case characters + ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, ///< all alphanumeric characters + CASE_TYPES = UPPER | LOWER, ///< all case-able characters + ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE ///< all character types }; /** diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index ec22186ea99..279bf44e7fc 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -50,7 +50,7 @@ namespace strings { * * @param input Lists column to format. * @param na_rep Replacment string for null elements. - * @param separator Strings to use for enclosing list components and separating elements. + * @param separators Strings to use for enclosing list components and separating elements. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. 
*/ diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 56205c161b1..aa3f544202f 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -17,7 +17,7 @@ #include #include -#include +#include namespace cudf { namespace strings { @@ -46,7 +46,7 @@ __device__ inline thrust::pair parse_integer( constexpr UnsignedDecimalType decimal_max = (std::numeric_limits::max() - 9L) / 10L; - uint64_t value = 0; // for checking overflow + __uint128_t value = 0; // for checking overflow int32_t exp_offset = 0; bool decimal_found = false; @@ -137,7 +137,7 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int // if string begins with a sign, continue with next character if (sign != 0) ++iter; - using UnsignedDecimalType = std::make_unsigned_t; + using UnsignedDecimalType = cuda::std::make_unsigned_t; auto [value, exp_offset] = parse_integer(iter, iter_end); if (value == 0) { return DecimalType{0}; } @@ -150,11 +150,9 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int exp_ten += exp_offset; // shift the output value based on the exp_ten and the scale values - if (exp_ten < scale) { - value = value / static_cast(exp10(static_cast(scale - exp_ten))); - } else { - value = value * static_cast(exp10(static_cast(exp_ten - scale))); - } + value = exp_ten < scale + ? value / static_cast(exp10(static_cast(scale - exp_ten))) + : value * static_cast(exp10(static_cast(exp_ten - scale))); return static_cast(value) * (sign == 0 ? 1 : sign); } diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index ec4a88a0e46..eb7258830ce 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -315,7 +315,7 @@ std::unique_ptr gather( d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, size_t{0}, - thrust::plus{}); + thrust::plus{}); CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), "total size of output strings is too large for a cudf column"); diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f6aee6d22cc..637b3b0851b 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -33,9 +33,9 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, /// default - MULTILINE = 8, /// the '^' and '$' honor new-line characters - DOTALL = 16 /// the '.' matching includes new-line characters + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16 ///< the '.' matching includes new-line characters }; /** diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh index 82da5ad8f10..d85d19d7f10 100644 --- a/cpp/include/cudf/strings/string.cuh +++ b/cpp/include/cudf/strings/string.cuh @@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str) thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; }); } +/** + * @brief Returns true if input contains the not-a-number string. + * + * The following are valid for this function: "NAN" and "NaN" + * @param d_str input string + * @return true if input is as valid NaN string. 
+ */ +inline __device__ bool is_nan_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') && + (ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n'); +} + +/** + * @brief Returns true if input contains the infinity string. + * + * The following are valid for this function: "INF", "INFINITY", and "Inf" + * @param d_str input string + * @return true if input is as valid Inf string. + */ +inline __device__ bool is_inf_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + auto const size = d_str.size_bytes(); + + if (size != 3 && size != 8) return false; + + auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') && + (ptr[2] == 'F' || ptr[2] == 'f'); + + return prefix_valid && + ((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') && + (ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') && + (ptr[7] == 'Y' || ptr[7] == 'y'))); +} + /** * @brief Returns `true` if all characters in the string * are valid for conversion to a float type. @@ -65,8 +102,8 @@ inline __device__ bool is_integer(string_view const& d_str) * An empty string returns `false`. * No bounds checking is performed to verify if the value would fit * within a specific float type. - * The following strings are also allowed "NaN", "Inf" and, "-Inf" - * and will return true. + * The following strings are also allowed and will return true: + * "NaN", "NAN", "Inf", "INF", "INFINITY" * * @param d_str String to check. * @return true if string has valid float characters @@ -74,29 +111,32 @@ inline __device__ bool is_integer(string_view const& d_str) inline __device__ bool is_float(string_view const& d_str) { if (d_str.empty()) return false; - // strings allowed by the converter - if (d_str.compare("NaN", 3) == 0) return true; - if (d_str.compare("Inf", 3) == 0) return true; - if (d_str.compare("-Inf", 4) == 0) return true; bool decimal_found = false; bool exponent_found = false; size_type bytes = d_str.size_bytes(); const char* data = d_str.data(); // sign character allowed at the beginning of the string - size_type chidx = (*data == '-' || *data == '+') ? 1 : 0; - bool result = chidx < bytes; + size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0; + + bool result = ch_idx < bytes; + // check for nan and infinity strings + if (result && data[ch_idx] > '9') { + auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx); + if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true; + } + // check for float chars [0-9] and a single decimal '.' 
// and scientific notation [eE][+-][0-9] - for (; chidx < bytes; ++chidx) { - auto chr = data[chidx]; + for (; ch_idx < bytes; ++ch_idx) { + auto chr = data[ch_idx]; if (chr >= '0' && chr <= '9') continue; if (!decimal_found && chr == '.') { decimal_found = true; // no more decimals continue; } if (!exponent_found && (chr == 'e' || chr == 'E')) { - if (chidx + 1 < bytes) chr = data[chidx + 1]; - if (chr == '-' || chr == '+') ++chidx; + if (ch_idx + 1 < bytes) chr = data[ch_idx + 1]; + if (chr == '-' || chr == '+') ++ch_idx; decimal_found = true; // no decimal allowed in exponent exponent_found = true; // no more exponents continue; diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index 72863bdf23b..fe9cd41e780 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -31,9 +31,9 @@ namespace strings { * @brief Direction identifier for strip() function. */ enum class strip_type { - LEFT, //<< strip characters from the beginning of the string - RIGHT, //<< strip characters from the end of the string - BOTH //<< strip characters from the beginning and end of the string + LEFT, ///< strip characters from the beginning of the string + RIGHT, ///< strip characters from the end of the string + BOTH ///< strip characters from the beginning and end of the string }; /** diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index e014f88c451..0cbf6b22029 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -60,7 +60,10 @@ std::unique_ptr translate( /** * @brief Removes or keeps the specified character ranges in cudf::strings::filter_characters */ -enum class filter_type : bool { KEEP, REMOVE }; +enum class filter_type : bool { + KEEP, ///< All characters but those specified are removed + REMOVE ///< Only the specified characters are removed +}; /** * @brief Removes ranges of characters from each string in a strings column. diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index c719c564a87..70ccac2f75d 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -67,7 +67,7 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs) } } // namespace detail -/* +/** * @brief A specialization for floating-point `Element` type relational comparison * to derive the order of the elements with respect to `lhs`. Specialization is to * handle `nan` in the order shown below. @@ -187,6 +187,7 @@ class element_equality_comparator { * * @param lhs_element_index The index of the first element * @param rhs_element_index The index of the second element + * @return True if both lhs and rhs element are both nulls and `nulls_are_equal` is true, or equal * */ template transform( bool is_ptx, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr generalized_masked_op( - table_view const& data_view, - std::string const& binary_udf, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Creates a null_mask from `input` by converting `NaN` to null and * preserving existing null values and also returns new null_count. 
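As a rough illustration of the broadened NaN/Inf handling in the is_float device predicate above: the public cudf::strings::is_float column API (assumed here to route through this predicate, and paired with the cudf::test column wrapper for brevity) should now flag the case-insensitive NaN/Inf spellings as valid. A sketch, not part of the diff:

#include <cudf/strings/convert/convert_floats.hpp>
#include <cudf_test/column_wrapper.hpp>

cudf::test::strings_column_wrapper input({"1.25", "-3.5e2", "NaN", "INFINITY", "-inf", "infin"});
auto result = cudf::strings::is_float(cudf::strings_column_view{input});
// expected BOOL8 values: true, true, true, true, true, false ("infin" is neither 3 nor 8 characters)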
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index e026ae9ac0f..13d5f8e06bc 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -236,6 +236,7 @@ enum class type_id : int32_t { LIST, ///< List elements DECIMAL32, ///< Fixed-point type with int32_t DECIMAL64, ///< Fixed-point type with int64_t + DECIMAL128, ///< Fixed-point type with __int128_t STRUCT, ///< Struct elements // `NUM_TYPE_IDS` must be last! NUM_TYPE_IDS ///< Total number of type ids @@ -271,7 +272,7 @@ class data_type { */ explicit data_type(type_id id, int32_t scale) : _id{id}, _fixed_point_scale{scale} { - assert(id == type_id::DECIMAL32 || id == type_id::DECIMAL64); + assert(id == type_id::DECIMAL32 || id == type_id::DECIMAL64 || id == type_id::DECIMAL128); } /** diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 254a7988e2e..36f08b7f23e 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,28 +28,28 @@ namespace cudf { */ enum class unary_operator : int32_t { - SIN, // < Trigonometric sine - COS, // < Trigonometric cosine - TAN, // < Trigonometric tangent - ARCSIN, // < Trigonometric sine inverse - ARCCOS, // < Trigonometric cosine inverse - ARCTAN, // < Trigonometric tangent inverse - SINH, // < Hyperbolic sine - COSH, // < Hyperbolic cosine - TANH, // < Hyperbolic tangent - ARCSINH, // < Hyperbolic sine inverse - ARCCOSH, // < Hyperbolic cosine inverse - ARCTANH, // < Hyperbolic tangent inverse - EXP, // < Exponential (base e, Euler number) - LOG, // < Natural Logarithm (base e) - SQRT, // < Square-root (x^0.5) - CBRT, // < Cube-root (x^(1.0/3)) - CEIL, // < Smallest integer value not less than arg - FLOOR, // < largest integer value not greater than arg - ABS, // < Absolute value - RINT, // < Rounds the floating-point argument arg to an integer value - BIT_INVERT, // < Bitwise Not (~) - NOT, // < Logical Not (!) + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) }; /** diff --git a/cpp/include/cudf/utilities/traits.cuh b/cpp/include/cudf/utilities/traits.cuh new file mode 100644 index 00000000000..43587ffa583 --- /dev/null +++ b/cpp/include/cudf/utilities/traits.cuh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace cudf { + +/** + * @addtogroup utility_types + * @{ + * @file + */ + +/** + * @brief Indicates whether the type `T` has support for atomics + * + * @tparam T The type to verify + * @return true `T` has support for atomics + * @return false `T` no support for atomics + */ +template +constexpr inline bool has_atomic_support() +{ + return cuda::std::atomic::is_always_lock_free; +} + +struct has_atomic_support_impl { + template + constexpr bool operator()() + { + return has_atomic_support(); + } +}; + +/** + * @brief Indicates whether `type` has support for atomics + * + * @param type The `data_type` to verify + * @return true `type` has support for atomics + * @return false `type` no support for atomics + */ +constexpr inline bool has_atomic_support(data_type type) +{ + return cudf::type_dispatcher(type, has_atomic_support_impl{}); +} + +/** @} */ + +} // namespace cudf diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 40a833112e1..d1bd3049ba3 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -177,7 +177,7 @@ inline bool is_equality_comparable(data_type type) template constexpr inline bool is_numeric() { - return std::is_integral::value or std::is_floating_point::value; + return cuda::std::is_integral() or std::is_floating_point::value; } struct is_numeric_impl { @@ -404,7 +404,8 @@ constexpr inline bool is_timestamp(data_type type) template constexpr inline bool is_fixed_point() { - return std::is_same_v || std::is_same_v; + return std::is_same_v || std::is_same_v || + std::is_same_v; } struct is_fixed_point_impl { diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 857ddafa82c..d7d38aba4f3 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -85,8 +85,9 @@ using id_to_type = typename id_to_type_impl::type; /** * @brief "Returns" the corresponding type that is stored on the device when using `cudf::column` * - * For `decimal32`, the storage type is an `int32_t`. - * For `decimal64`, the storage type is an `int64_t`. + * For `decimal32`, the storage type is an `int32_t`. + * For `decimal64`, the storage type is an `int64_t`. + * For `decimal128`, the storage type is an `__int128_t`. 
* * Use this "type function" with the `using` type alias: * @code @@ -98,25 +99,11 @@ using id_to_type = typename id_to_type_impl::type; // clang-format off template using device_storage_type_t = - std::conditional_t, int32_t, - std::conditional_t, int64_t, T>>; + std::conditional_t, int32_t, + std::conditional_t, int64_t, + std::conditional_t, __int128_t, T>>>; // clang-format on -/** - * @brief Returns the corresponding `type_id` of type stored on device for a given `type_id` - * - * @param id The given `type_id` - * @return Corresponding `type_id` of type stored on device - */ -inline type_id device_storage_type_id(type_id id) -{ - switch (id) { - case type_id::DECIMAL32: return type_id::INT32; - case type_id::DECIMAL64: return type_id::INT64; - default: return id; - } -} - /** * @brief Checks if `fixed_point`-like types have template type `T` matching the column's * stored type id @@ -127,10 +114,24 @@ inline type_id device_storage_type_id(type_id id) * @return `false` If T does not match the stored column `type_id` */ template -bool type_id_matches_device_storage_type(type_id id) +constexpr bool type_id_matches_device_storage_type(type_id id) { return (id == type_id::DECIMAL32 && std::is_same_v) || - (id == type_id::DECIMAL64 && std::is_same_v) || id == type_to_id(); + (id == type_id::DECIMAL64 && std::is_same_v) || + (id == type_id::DECIMAL128 && std::is_same_v) || id == type_to_id(); +} + +/** + * @brief Checks if `id` is fixed_point (DECIMAL32/64/128) + * + * @return `true` if `id` is `DECIMAL32`, `DECIMAL64` or `DECIMAL128` + * @return `false` otherwise + */ +constexpr bool is_fixed_point(cudf::type_id id) +{ + return id == type_id::DECIMAL32 or // + id == type_id::DECIMAL64 or // + id == type_id::DECIMAL128; } /** @@ -188,6 +189,7 @@ CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32); CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST); CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32); CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64); +CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128); CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT); /** @@ -221,6 +223,7 @@ MAP_NUMERIC_SCALAR(int8_t) MAP_NUMERIC_SCALAR(int16_t) MAP_NUMERIC_SCALAR(int32_t) MAP_NUMERIC_SCALAR(int64_t) +MAP_NUMERIC_SCALAR(__int128_t) MAP_NUMERIC_SCALAR(uint8_t) MAP_NUMERIC_SCALAR(uint16_t) MAP_NUMERIC_SCALAR(uint32_t) @@ -253,6 +256,12 @@ struct type_to_scalar_type_impl { using ScalarDeviceType = cudf::fixed_point_scalar_device_view; }; +template <> +struct type_to_scalar_type_impl { + using ScalarType = cudf::fixed_point_scalar; + using ScalarDeviceType = cudf::fixed_point_scalar_device_view; +}; + template <> // TODO: this is a temporary solution for make_pair_iterator struct type_to_scalar_type_impl { using ScalarType = cudf::numeric_scalar; @@ -492,6 +501,9 @@ CUDF_HDFI constexpr decltype(auto) type_dispatcher(cudf::data_type dtype, Functo case type_id::DECIMAL64: return f.template operator()::type>( std::forward(args)...); + case type_id::DECIMAL128: + return f.template operator()::type>( + std::forward(args)...); case type_id::STRUCT: return f.template operator()::type>( std::forward(args)...); @@ -519,7 +531,7 @@ template struct double_type_dispatcher_second_type { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const + CUDF_HDFI decltype(auto) operator()(F&& f, Ts&&... args) const { return f.template operator()(std::forward(args)...); } @@ -529,9 +541,7 @@ template
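Taken together, the type-dispatch changes above wire numeric::decimal128 through the same machinery as the existing decimal types. A small editorial sketch (not part of the diff, assuming the numeric::decimal128 alias added elsewhere in this release) of what now compiles:

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
#include <type_traits>

// DECIMAL128 is a first-class type id whose device storage type is __int128_t.
static_assert(cudf::type_to_id<numeric::decimal128>() == cudf::type_id::DECIMAL128);
static_assert(std::is_same_v<cudf::device_storage_type_t<numeric::decimal128>, __int128_t>);

// data_type accepts DECIMAL128 with a scale, and a 128-bit fixed_point scalar can be constructed.
auto dtype  = cudf::data_type{cudf::type_id::DECIMAL128, -2};
auto scalar = cudf::fixed_point_scalar<numeric::decimal128>{12345, numeric::scale_type{-2}};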