diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh new file mode 100644 index 00000000000..37ffcd401fc --- /dev/null +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace cudf::test::print { + +constexpr int32_t hex_tag = 0; + +template +struct TaggedType { + T v; +}; + +template +using hex_t = TaggedType; + +/** + * @brief Function object to transform a built-in type to a tagged type (e.g., in order to print + * values from an iterator returning uint32_t as hex values) + * + * @tparam TaggedTypeT A TaggedType template specialisation + */ +template +struct ToTaggedType { + template + CUDF_HOST_DEVICE TaggedTypeT operator()(T const& v) const + { + return TaggedTypeT{v}; + } +}; + +/** + * @brief Returns an iterator that causes the values from \p it to be printed as hex values. + * + * @tparam InItT A random-access input iterator type + * @param it A random-access input iterator t + * @return + */ +template +auto hex(InItT it) +{ + using value_t = typename std::iterator_traits::value_type; + using tagged_t = hex_t; + return thrust::make_transform_iterator(it, ToTaggedType{}); +} + +template && std::is_signed_v)> +CUDF_HOST_DEVICE void print_value(int32_t width, T arg) +{ + printf("%*d", width, arg); +} + +template && std::is_unsigned_v)> +CUDF_HOST_DEVICE void print_value(int32_t width, T arg) +{ + printf("%*d", width, arg); +} + +CUDF_HOST_DEVICE void print_value(int32_t width, char arg) { printf("%*c", width, arg); } + +template +CUDF_HOST_DEVICE void print_value(int32_t width, hex_t arg) +{ + printf("%*X", width, arg.v); +} + +namespace detail { +template +CUDF_HOST_DEVICE void print_values(int32_t width, char delimiter, T arg) +{ + print_value(width, arg); +} + +template +CUDF_HOST_DEVICE void print_values(int32_t width, char delimiter, T arg, Ts... args) +{ + print_value(width, arg); + if (delimiter) printf("%c", delimiter); + print_values(width, delimiter, args...); +} + +template +__global__ void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... args) +{ + if (threadIdx.x == 0 && blockIdx.x == 0) { + for (std::size_t i = 0; i < count; i++) { + printf("%6lu: ", i); + print_values(width, delimiter, args[i]...); + printf("\n"); + } + } +} +} // namespace detail + +/** + * @brief Prints \p count elements from each of the given device-accessible iterators. + * + * @param count The number of items to print from each device-accessible iterator + * @param stream The cuda stream to which the printing kernel shall be dispatched + * @param args List of iterators to be printed + */ +template +void print_array(std::size_t count, rmm::cuda_stream_view stream, Ts... 
args) +{ + // The width to pad printed numbers to + constexpr int32_t width = 6; + + // Delimiter used for separating values from subsequent iterators + constexpr char delimiter = ','; + + // TODO we want this to compile to nothing dependnig on compiler flag, rather than runtime + if (std::getenv("CUDA_DBG_DUMP") != nullptr) { + detail::print_array_kernel<<<1, 1, 0, stream.value()>>>(count, width, delimiter, args...); + } +} + +} // namespace cudf::test::print diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh new file mode 100644 index 00000000000..9502922a379 --- /dev/null +++ b/cpp/src/io/fst/logical_stack.cuh @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::io::fst { + +/** + * @brief Describes the kind of stack operation. + */ +enum class stack_op_type : int8_t { + READ = 0, ///< Operation reading what is currently on top of the stack + PUSH = 1, ///< Operation pushing a new item on top of the stack + POP = 2 ///< Operation popping the item currently on top of the stack +}; + +namespace detail { + +/** + * @brief A convenience struct that represents a stack operation as a pair, where the stack_level + * represents the stack's level and the value represents the stack symbol. + * + * @tparam StackLevelT The stack level type sufficient to cover all stack levels. Must be signed + * type as any subsequence of stack operations must be able to be covered. E.g., consider the first + * 10 operations are all push and the last 10 operations are all pop operations, we need to be able + * to represent a partial aggregate of the first ten items, which is '+10', just as well as a + * partial aggregate of the last ten items, which is '-10'. + * @tparam ValueT The value type that corresponds to the stack symbols (i.e., covers the stack + * alphabet). + */ +template +struct StackOp { + // Must be signed type as any subsequence of stack operations must be able to be covered. + static_assert(std::is_signed_v, "StackLevelT has to be a signed type"); + + StackLevelT stack_level; + ValueT value; +}; + +/** + * @brief Helper class to assist with radix sorting StackOp instances by stack level. + * + * @tparam BYTE_SIZE The size of the StackOp. + */ +template +struct StackOpToUnsigned { + using UnsignedT = void; +}; + +template <> +struct StackOpToUnsigned<2U> { + using UnsignedT = uint16_t; +}; + +template <> +struct StackOpToUnsigned<4U> { + using UnsignedT = uint32_t; +}; + +template <> +struct StackOpToUnsigned<8U> { + using UnsignedT = uint64_t; +}; + +/** + * @brief Alias template to retrieve an unsigned bit-representation that can be used for radix + * sorting the stack level of a StackOp. 
+ * + * @tparam StackOpT The StackOp class template instance for which to get an unsigned + * bit-representation + */ +template +using UnsignedStackOpType = typename StackOpToUnsigned::UnsignedT; + +/** + * @brief Function object class template used for converting a stack symbol to a stack + * operation that has a stack level to which an operation applies. + * + * @tparam StackOpT + * @tparam StackSymbolToStackOpTypeT + */ +template +struct StackSymbolToStackOp { + template + constexpr CUDF_HOST_DEVICE StackOpT operator()(StackSymbolT const& stack_symbol) const + { + stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); + // PUSH => +1, POP => -1, READ => 0 + int32_t level_delta = stack_op == stack_op_type::PUSH ? 1 + : stack_op == stack_op_type::POP ? -1 + : 0; + return StackOpT{static_cast(level_delta), stack_symbol}; + } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; +}; + +/** + * @brief Binary reduction operator to compute the absolute stack level from relative stack levels + * (i.e., +1 for a PUSH, -1 for a POP operation). + */ +struct AddStackLevelFromStackOp { + template + constexpr CUDF_HOST_DEVICE StackOp operator()( + StackOp const& lhs, StackOp const& rhs) const + { + StackLevelT new_level = lhs.stack_level + rhs.stack_level; + return StackOp{new_level, rhs.value}; + } +}; + +/** + * @brief Binary reduction operator that propagates a write operation for a specific stack level to + * all reads of that same stack level. That is, if the stack level of LHS compares equal to the + * stack level of the RHS and if the RHS is a read and the LHS is a write operation type, then we + * return LHS, otherwise we return the RHS. + */ +template +struct PopulatePopWithPush { + template + constexpr CUDF_HOST_DEVICE StackOp operator()( + StackOp const& lhs, StackOp const& rhs) const + { + // If RHS is a read, then we need to figure out whether we can propagate the value from the LHS + bool is_rhs_read = symbol_to_stack_op_type(rhs.value) != stack_op_type::PUSH; + + // Whether LHS is a matching write (i.e., the push operation that is on top of the stack for the + // RHS's read) + bool is_lhs_matching_write = (lhs.stack_level == rhs.stack_level) && + symbol_to_stack_op_type(lhs.value) == stack_op_type::PUSH; + + return (is_rhs_read && is_lhs_matching_write) ? lhs : rhs; + } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; +}; + +/** + * @brief Binary reduction operator that is used to replace each read_symbol occurrence with the + * last non-read_symbol that precedes such read_symbol. + */ +template +struct PropagateLastWrite { + constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StackSymbolT const& lhs, + StackSymbolT const& rhs) const + { + // If RHS is a yet-to-be-propagated, then we need to check whether we can use the LHS to fill + bool is_rhs_read = (rhs == read_symbol); + + // We propagate the write from the LHS if it's a write + bool is_lhs_write = (lhs != read_symbol); + + return (is_rhs_read && is_lhs_write) ? lhs : rhs; + } + + /// The read_symbol that is supposed to be replaced + StackSymbolT read_symbol; +}; + +/** + * @brief Helper function object class to convert a StackOp to the stack symbol of that + * StackOp. 
+ */ +struct StackOpToStackSymbol { + template + constexpr CUDF_HOST_DEVICE ValueT operator()(StackOp const& kv_op) const + { + return kv_op.value; + } +}; + +/** + * @brief Replaces all operations that apply to stack level '0' with the empty stack symbol + */ +template +struct RemapEmptyStack { + constexpr CUDF_HOST_DEVICE StackOpT operator()(StackOpT const& kv_op) const + { + return kv_op.stack_level == 0 ? empty_stack_symbol : kv_op; + } + StackOpT empty_stack_symbol; +}; + +} // namespace detail + +/** + * @brief Takes a sparse representation of a sequence of stack operations that either push something + * onto the stack or pop something from the stack and resolves the symbol that is on top of the + * stack. + * + * @tparam StackLevelT Signed integer type that must be sufficient to cover [-max_stack_level, + * max_stack_level] for the given sequence of stack operations. Must be signed as it needs to cover + * the stack level of any arbitrary subsequence of stack operations. + * @tparam StackSymbolItT An input iterator type that provides the sequence of symbols that + * represent stack operations + * @tparam SymbolPositionT The index that this stack operation is supposed to apply to + * @tparam StackSymbolToStackOpTypeT Function object class to transform items from StackSymbolItT to + * stack_op_type + * @tparam TopOfStackOutItT Output iterator type to which StackSymbolT are being assigned + * @tparam StackSymbolT The internal type being used (usually corresponding to StackSymbolItT's + * value_type) + * @tparam OffsetT Signed or unsigned integer type large enough to index into both the sparse input + * sequence and the top-of-stack output sequence + * + * @param[in] d_symbols Sequence of symbols that represent stack operations. Memory may alias with + * \p d_top_of_stack + * @param[in,out] d_symbol_positions Sequence of symbol positions (for a sparse representation), + * sequence must be ordered in ascending order. Note, the memory of this array is repurposed for + * double-buffering. 
+ * @param[in] symbol_to_stack_op Function object that returns a stack operation type (push, pop, or + * read) for a given symbol from \p d_symbols + * @param[out] d_top_of_stack A random access output iterator that will be populated with + * what-is-on-top-of-the-stack for the given sequence of stack operations \p d_symbols + * @param[in] empty_stack_symbol The symbol that will be written to top_of_stack whenever the stack + * was empty + * @param[in] read_symbol A symbol that may not be confused for a symbol that would push to the + * stack + * @param[in] num_symbols_out The number of symbols that are supposed to be filled with + * what-is-on-top-of-the-stack + * @param[in] stream The cuda stream to which to dispatch the work + */ +template +void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, + device_span d_symbol_positions, + StackSymbolToStackOpTypeT symbol_to_stack_op, + TopOfStackOutItT d_top_of_stack, + StackSymbolT const empty_stack_symbol, + StackSymbolT const read_symbol, + std::size_t const num_symbols_out, + rmm::cuda_stream_view stream = cudf::default_stream_value) +{ + rmm::device_buffer temp_storage{}; + + // Type used to hold pairs of (stack_level, value) pairs + using StackOpT = detail::StackOp; + + // The unsigned integer type that we use for radix sorting items of type StackOpT + using StackOpUnsignedT = detail::UnsignedStackOpType; + static_assert(!std::is_void(), "unsupported StackOpT size"); + + // Transforming sequence of stack symbols to stack operations + using StackSymbolToStackOpT = detail::StackSymbolToStackOp; + + // TransformInputIterator converting stack symbols to stack operations + using TransformInputItT = + cub::TransformInputIterator; + + auto const num_symbols_in = d_symbol_positions.size(); + + // Converting a stack symbol that may either push or pop to a stack operation: + // stack_symbol -> ([+1,0,-1], stack_symbol) + StackSymbolToStackOpT stack_sym_to_kv_op{symbol_to_stack_op}; + TransformInputItT stack_symbols_in(d_symbols, stack_sym_to_kv_op); + + // Double-buffer for sorting along the given sequence of symbol positions (the sparse + // representation) + cub::DoubleBuffer d_symbol_positions_db{nullptr, nullptr}; + + // Double-buffer for sorting the stack operations by the stack level to which such operation + // applies + cub::DoubleBuffer d_kv_operations{nullptr, nullptr}; + + // A double-buffer that aliases memory from d_kv_operations with unsigned types in order to + // be able to perform a radix sort + cub::DoubleBuffer d_kv_operations_unsigned{nullptr, nullptr}; + + constexpr std::size_t bits_per_byte = 8; + constexpr std::size_t begin_bit = offsetof(StackOpT, stack_level) * bits_per_byte; + constexpr std::size_t end_bit = begin_bit + (sizeof(StackOpT::stack_level) * bits_per_byte); + + // The stack operation that makes sure that reads for stack level '0' will be populated + // with the empty_stack_symbol + StackOpT const empty_stack{0, empty_stack_symbol}; + + cub::TransformInputIterator, StackOpT*> + kv_ops_scan_in(nullptr, detail::RemapEmptyStack{empty_stack}); + StackOpT* kv_ops_scan_out = nullptr; + + std::size_t stack_level_scan_bytes = 0; + std::size_t stack_level_sort_bytes = 0; + std::size_t match_level_scan_bytes = 0; + std::size_t propagate_writes_scan_bytes = 0; + + // Getting temporary storage requirements for the prefix sum of the stack level after each + // operation + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations.Current(), + 
detail::AddStackLevelFromStackOp{}, + num_symbols_in, + stream)); + + // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the + // operations) + CUDF_CUDA_TRY(cub::DeviceRadixSort::SortPairs(nullptr, + stack_level_sort_bytes, + d_kv_operations_unsigned, + d_symbol_positions_db, + num_symbols_in, + begin_bit, + end_bit, + stream)); + + // Getting temporary storage requirements for the scan to match pop operations with the latest + // push of the same level + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + nullptr, + match_level_scan_bytes, + kv_ops_scan_in, + kv_ops_scan_out, + detail::PopulatePopWithPush{symbol_to_stack_op}, + num_symbols_in, + stream)); + + // Getting temporary storage requirements for the scan to propagate top-of-stack for spots that + // didn't push or pop + CUDF_CUDA_TRY( + cub::DeviceScan::ExclusiveScan(nullptr, + propagate_writes_scan_bytes, + d_top_of_stack, + d_top_of_stack, + detail::PropagateLastWrite{read_symbol}, + empty_stack_symbol, + num_symbols_out, + stream)); + + // Scratch memory required by the algorithms + auto total_temp_storage_bytes = std::max({stack_level_scan_bytes, + stack_level_sort_bytes, + match_level_scan_bytes, + propagate_writes_scan_bytes}); + + if (temp_storage.size() < total_temp_storage_bytes) { + temp_storage.resize(total_temp_storage_bytes, stream); + } + // Actual device buffer size, as we need to pass in an lvalue-ref to cub algorithms as + // temp_storage_bytes + total_temp_storage_bytes = temp_storage.size(); + + rmm::device_uvector d_symbol_position_alt{num_symbols_in, stream}; + rmm::device_uvector d_kv_ops_current{num_symbols_in, stream}; + rmm::device_uvector d_kv_ops_alt{num_symbols_in, stream}; + + //------------------------------------------------------------------------------ + // ALGORITHM + //------------------------------------------------------------------------------ + // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations + d_symbol_positions_db = + cub::DoubleBuffer{d_symbol_positions.data(), d_symbol_position_alt.data()}; + + // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations + d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; + + // Compute prefix sum of the stack level after each operation + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(temp_storage.data(), + total_temp_storage_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{}, + num_symbols_in, + stream)); + + // Stable radix sort, sorting by stack level of the operations + d_kv_operations_unsigned = cub::DoubleBuffer{ + reinterpret_cast(d_kv_operations.Current()), + reinterpret_cast(d_kv_operations.Alternate())}; + CUDF_CUDA_TRY(cub::DeviceRadixSort::SortPairs(temp_storage.data(), + total_temp_storage_bytes, + d_kv_operations_unsigned, + d_symbol_positions_db, + num_symbols_in, + begin_bit, + end_bit, + stream)); + + // TransformInputIterator that remaps all operations on stack level 0 to the empty stack symbol + kv_ops_scan_in = {reinterpret_cast(d_kv_operations_unsigned.Current()), + detail::RemapEmptyStack{empty_stack}}; + kv_ops_scan_out = reinterpret_cast(d_kv_operations_unsigned.Alternate()); + + // Inclusive scan to match pop operations with the latest push operation of that level + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + temp_storage.data(), + total_temp_storage_bytes, + kv_ops_scan_in, + kv_ops_scan_out, + 
detail::PopulatePopWithPush{symbol_to_stack_op}, + num_symbols_in, + stream)); + + // Fill the output tape with read-symbol + thrust::fill(rmm::exec_policy(stream), + thrust::device_ptr{d_top_of_stack}, + thrust::device_ptr{d_top_of_stack + num_symbols_out}, + read_symbol); + + // Transform the stack operations to the stack symbol they represent + cub::TransformInputIterator + kv_op_to_stack_sym_it(kv_ops_scan_out, detail::StackOpToStackSymbol{}); + + // Scatter the stack symbols to the output tape (spots that are not scattered to have been + // pre-filled with the read-symbol) + thrust::scatter(rmm::exec_policy(stream), + kv_op_to_stack_sym_it, + kv_op_to_stack_sym_it + num_symbols_in, + d_symbol_positions_db.Current(), + d_top_of_stack); + + // We perform an exclusive scan in order to fill the items at the very left that may + // be reading the empty stack before there's the first push occurrence in the sequence. + // Also, we're interested in the top-of-the-stack symbol before the operation was applied. + CUDF_CUDA_TRY( + cub::DeviceScan::ExclusiveScan(temp_storage.data(), + total_temp_storage_bytes, + d_top_of_stack, + d_top_of_stack, + detail::PropagateLastWrite{read_symbol}, + empty_stack_symbol, + num_symbols_out, + stream)); +} + +} // namespace cudf::io::fst diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8d8fc3210bb..e1970af6212 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -224,6 +224,7 @@ ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) +ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu new file mode 100644 index 00000000000..3c2cdd7fb5c --- /dev/null +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace { +namespace fst = cudf::io::fst; + +/** + * @brief Generates the sparse representation of stack operations to feed into the logical + * stack + * + * @param begin Forward input iterator to the first item of symbols that are checked for whether + * they push or pop + * @param end Forward input iterator to one one past the last item of symbols that are checked for + * whether they push or pop + * @param to_stack_op A function object that takes an instance of InputItT's value type and + * returns the kind of stack operation such item represents (i.e., of type stack_op_type) + * @param stack_symbol_out Forward output iterator to which symbols that either push or pop are + * assigned + * @param stack_op_index_out Forward output iterator to which the indexes of symbols that either + * push or pop are assigned + * @return Pair of iterators to one past the last item of the items written to \p stack_symbol_out + * and \p stack_op_index_out, respectively + */ +template +std::pair to_sparse_stack_symbols( + InputItT begin, + InputItT end, + ToStackOpTypeT to_stack_op, + StackSymbolOutItT stack_symbol_out, + StackOpIndexOutItT stack_op_index_out) +{ + std::size_t index = 0; + for (auto it = begin; it < end; it++) { + fst::stack_op_type op_type = to_stack_op(*it); + if (op_type == fst::stack_op_type::PUSH || op_type == fst::stack_op_type::POP) { + *stack_symbol_out = *it; + *stack_op_index_out = index; + stack_symbol_out++; + stack_op_index_out++; + } + index++; + } + return std::make_pair(stack_symbol_out, stack_op_index_out); +} + +/** + * @brief Reads in a sequence of items that represent stack operations, applies these operations to + * a stack, and, for every operation being read in, outputs what was the symbol on top of the stack + * before the operations was applied. In case the stack is empty before any operation, + * \p empty_stack will be output instead. 
+ * + * @tparam InputItT Forward input iterator type to items representing stack operations + * @tparam ToStackOpTypeT A transform function object class that maps an item representing a stack + * operation to the stack_op_type of such item + * @tparam StackSymbolT Type representing items being pushed onto the stack + * @tparam TopOfStackOutItT A forward output iterator type being assigned items of StackSymbolT + * @param[in] begin Forward iterator to the beginning of the items representing stack operations + * @param[in] end Iterator to one past the last item representing the stack operation + * @param[in] to_stack_op A function object that takes an instance of InputItT's value type and + * returns the kind of stack operation such item represents (i.e., of type stack_op_type) + * @param[in] empty_stack A symbol that will be written to top_of_stack_out_it whenever the stack + * was empty + * @param[out] top_of_stack The output iterator to which the item will be written to + * @return TopOfStackOutItT Iterators to one past the last element that was written + */ +template +TopOfStackOutItT to_top_of_stack(InputItT begin, + InputItT end, + ToStackOpTypeT to_stack_op, + StackSymbolT empty_stack, + TopOfStackOutItT top_of_stack_out_it) +{ + // This is the data structure that keeps track of the full stack state for each input symbol + std::stack stack_state; + + for (auto it = begin; it < end; it++) { + // Write what is currently on top of the stack when reading in the current symbol + *top_of_stack_out_it = stack_state.empty() ? empty_stack : stack_state.top(); + top_of_stack_out_it++; + + auto const& current = *it; + fst::stack_op_type op_type = to_stack_op(current); + + // Check whether this symbol corresponds to a push or pop operation and modify the stack + // accordingly + if (op_type == fst::stack_op_type::PUSH) { + stack_state.push(current); + } else if (op_type == fst::stack_op_type::POP) { + stack_state.pop(); + } + } + return top_of_stack_out_it; +} + +/** + * @brief Function object used to filter for brackets and braces that represent push and pop + * operations + * + */ +struct JSONToStackOp { + template + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + { + return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH + : (stack_symbol == '}' || stack_symbol == ']') ? 
fst::stack_op_type::POP + : fst::stack_op_type::READ; + } +}; +} // namespace + +// Base test fixture for tests +struct LogicalStackTest : public cudf::test::BaseFixture { +}; + +TEST_F(LogicalStackTest, GroundTruth) +{ + // Type sufficient to cover any stack level (must be a signed type) + using StackLevelT = int8_t; + using SymbolT = char; + using SymbolOffsetT = uint32_t; + + // The stack symbol that we'll fill everywhere where there's nothing on the stack + constexpr SymbolT empty_stack_symbol = '_'; + + // This just has to be a stack symbol that may not be confused with a symbol that would push + constexpr SymbolT read_symbol = 'x'; + + // Prepare cuda stream for data transfers & kernels + rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); + + // Test input, + std::string input = R"( {)" + R"(category": "reference",)" + R"("index:" [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} )" + R"({)" + R"("category": "reference",)" + R"("index:" [4,{},null,{"a":[]}],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} {} [] [ ])"; + + // Repeat input sample 1024x + for (std::size_t i = 0; i < 10; i++) + input += input; + + // Input's size + std::size_t string_size = input.size(); + + // Getting the symbols that actually modify the stack (i.e., symbols that push or pop) + std::string stack_symbols{}; + std::vector stack_op_indexes; + stack_op_indexes.reserve(string_size); + + // Get the sparse representation of stack operations + to_sparse_stack_symbols(std::cbegin(input), + std::cend(input), + JSONToStackOp{}, + std::back_inserter(stack_symbols), + std::back_inserter(stack_op_indexes)); + + rmm::device_uvector d_stack_ops{stack_symbols.size(), stream_view}; + rmm::device_uvector d_stack_op_indexes{stack_op_indexes.size(), stream_view}; + hostdevice_vector top_of_stack_gpu{string_size, stream_view}; + cudf::device_span d_stack_op_idx_span{d_stack_op_indexes}; + + cudaMemcpyAsync(d_stack_ops.data(), + stack_symbols.data(), + stack_symbols.size() * sizeof(SymbolT), + cudaMemcpyHostToDevice, + stream.value()); + + cudaMemcpyAsync(d_stack_op_indexes.data(), + stack_op_indexes.data(), + stack_op_indexes.size() * sizeof(SymbolOffsetT), + cudaMemcpyHostToDevice, + stream.value()); + + // Run algorithm + fst::sparse_stack_op_to_top_of_stack(d_stack_ops.data(), + d_stack_op_idx_span, + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + string_size, + stream.value()); + + // Async copy results from device to host + top_of_stack_gpu.device_to_host(stream_view); + + // Get CPU-side results for verification + std::string top_of_stack_cpu{}; + top_of_stack_cpu.reserve(string_size); + to_top_of_stack(std::cbegin(input), + std::cend(input), + JSONToStackOp{}, + empty_stack_symbol, + std::back_inserter(top_of_stack_cpu)); + + // Make sure results have been copied back to host + stream.synchronize(); + + // Verify results + ASSERT_EQ(string_size, top_of_stack_cpu.size()); + ASSERT_EQ(top_of_stack_gpu.size(), top_of_stack_cpu.size()); + for (size_t i = 0; i < string_size && i < top_of_stack_cpu.size(); i++) { + ASSERT_EQ(top_of_stack_gpu.host_ptr()[i], top_of_stack_cpu[i]) << "Mismatch at index #" << i; + } +} + +CUDF_TEST_PROGRAM_MAIN()
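Usage sketch (not part of the patch above; the helper function and buffer names are made up for illustration): print_array takes one device-accessible iterator per column, and any iterator can be wrapped in hex() to render its values in hexadecimal. Values are padded to a width of 6 and separated by commas, and the kernel only runs when the CUDA_DBG_DUMP environment variable is set at runtime. The caller has to live in a .cu file, since print_array instantiates a kernel template.

```cpp
// Hypothetical debugging helper; assumes d_flags holds at least d_offsets.size() elements.
#include <cudf_test/print_utilities.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>

void dump_state(rmm::device_uvector<uint32_t> const& d_offsets,
                rmm::device_uvector<uint8_t> const& d_flags,
                rmm::cuda_stream_view stream)
{
  namespace dbg = cudf::test::print;
  // Prints one line per element: "<index>: <offset>, <flags rendered as hex>"
  dbg::print_array(d_offsets.size(), stream, d_offsets.data(), dbg::hex(d_flags.data()));
}
```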
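Worked example (also not part of the patch, provided for review): a hand-computed trace of the stages that sparse_stack_op_to_top_of_stack chains together, using the 7-character input "{a[b]c}", the conventions of JSONToStackOp (braces and brackets push and pop, everything else reads), '_' as the empty-stack symbol and 'x' as the read symbol.

```cpp
// input (positions 0..6)        :  {  a  [  b  ]  c  }
// sparse stack ops              :  "{[]}" at positions [0, 2, 4, 6]
// level deltas (+1 push, -1 pop):  [+1, +1, -1, -1]
// inclusive level scan          :  [(1,'{'), (2,'['), (1,']'), (0,'}')]
// stable radix sort by level    :  [(0,'}')@6, (1,'{')@0, (1,']')@4, (2,'[')@2]
// remap level 0 to '_' and scan, matching each pop/read with the latest push of its level:
//                                  ['_'@6, '{'@0, '{'@4, '['@2]
// fill output with read symbol  :  "xxxxxxx"
// scatter matched symbols       :  "{x[x{x_"
// exclusive propagate-last-write scan, seeded with '_':
//                                  "_{{[[{{"   // == top of stack before each input position,
//                                              //    matching the CPU reference in the test
```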
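A note on the radix-sort key (my reading of the code, not something the patch states explicitly): the sort reinterprets each StackOp as an unsigned integer of the same size and restricts the key to the bit range covering the stack_level member, so the stack symbol simply travels alongside its level. Sketched for the smallest instantiation, and assuming a little-endian target so that the struct's first member occupies the key's low-order bits:

```cpp
// For StackLevelT = int8_t and ValueT = char, StackOpT is 2 bytes and is sorted as a uint16_t:
//   begin_bit = offsetof(StackOpT, stack_level) * 8            // == 0
//   end_bit   = begin_bit + sizeof(StackOpT::stack_level) * 8  // == 8
// Only bits [0, 8) -- the stack_level byte -- participate in the comparison; after the prefix
// sum the levels are non-negative for well-formed input, so comparing them as unsigned bits
// preserves their numeric order.
```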