From 5b4a6fcf5bd7c8e3bffd4ebf921da10ac5d33280 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 28 Mar 2022 06:03:56 -0700 Subject: [PATCH 001/173] Squashed with initial test set --- cpp/src/io/fst/logical_stack.cuh | 492 +++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/fst/logical_stack_test.cu | 275 ++++++++++++++ 3 files changed, 768 insertions(+) create mode 100644 cpp/src/io/fst/logical_stack.cuh create mode 100644 cpp/tests/io/fst/logical_stack_test.cu diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh new file mode 100644 index 00000000000..7069ee3b404 --- /dev/null +++ b/cpp/src/io/fst/logical_stack.cuh @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace io { +namespace fst { + +/** + * @brief Describes the kind of stack operation. + */ +enum class stack_op_type : int32_t { + READ = 0, ///< Operation reading what is currently on top of the stack + PUSH = 1, ///< Operation pushing a new item on top of the stack + POP = 2 ///< Operation popping the item currently on top of the stack +}; + +namespace detail { + +/** + * @brief A convenience struct that represents a stack opepration as a key-value pair, where the key + * represents the stack's level and the value represents the stack symbol. + * + * @tparam KeyT The key type sufficient to cover all stack levels. Must be signed type as any + * subsequence of stack operations must be able to be covered. E.g., consider the first 10 + * operations are all push and the last 10 operations are all pop operations, we need to be able to + * represent a partial aggregate of the first ten items, which is '+10', just as well as a partial + * aggregate of the last ten items, which is '-10'. + * @tparam ValueT The value type that corresponds to the stack symbols (i.e., covers the stack + * alphabet). + */ +template +struct KeyValueOp { + KeyT key; + ValueT value; +}; + +/** + * @brief Helper class to assist with radix sorting KeyValueOp instances by key. + * + * @tparam BYTE_SIZE The size of the KeyValueOp. + */ +template +struct KeyValueOpToUnsigned { +}; + +template <> +struct KeyValueOpToUnsigned<1U> { + using UnsignedT = uint8_t; +}; + +template <> +struct KeyValueOpToUnsigned<2U> { + using UnsignedT = uint16_t; +}; + +template <> +struct KeyValueOpToUnsigned<4U> { + using UnsignedT = uint32_t; +}; + +template <> +struct KeyValueOpToUnsigned<8U> { + using UnsignedT = uint64_t; +}; + +/** + * @brief Alias template to retrieve an unsigned bit-representation that can be used for radix + * sorting the key of a KeyValueOp. 
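+ * For example (illustrative only): the unit test instantiates KeyValueOp with an int8_t key and a
+ * char value; assuming the struct carries no padding bytes, sizeof(KeyValueOp) is 2 and this alias
+ * resolves to uint16_t.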
+ * + * @tparam KeyValueOpT The KeyValueOp class template instance for which to get an unsigned + * bit-representation + */ +template +using UnsignedKeyValueOpType = typename KeyValueOpToUnsigned::UnsignedT; + +/** + * @brief Function object class template used for converting a stack operation to a key-value store + * operation, where the key corresponds to the stack level being accessed. + * + * @tparam KeyValueOpT + * @tparam StackSymbolToStackOpTypeT + */ +template +struct StackSymbolToKVOp { + template + __host__ __device__ __forceinline__ KeyValueOpT operator()(StackSymbolT const& stack_symbol) const + { + stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); + // PUSH => +1, POP => -1, READ => 0 + int32_t level_delta = stack_op == stack_op_type::PUSH ? 1 + : stack_op == stack_op_type::POP ? -1 + : 0; + return KeyValueOpT{static_cast(level_delta), stack_symbol}; + } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; +}; + +/** + * @brief Binary reduction operator to compute the absolute stack level from relative stack levels + * (i.e., +1 for a PUSH, -1 for a POP operation). + */ +struct AddStackLevelFromKVOp { + template + __host__ __device__ __forceinline__ KeyValueOp operator()( + KeyValueOp const& lhs, KeyValueOp const& rhs) const + { + KeyT new_level = lhs.key + rhs.key; + return KeyValueOp{new_level, rhs.value}; + } +}; + +/** + * @brief Binary reduction operator that propagates a write operation for a specific key to all + * reads of that same key. That is, if the key of LHS compares equal to the key of the RHS and if + * the RHS is a read and the LHS is a write operation type, then we return LHS, otherwise we return + * the RHS. + */ +template +struct PopulatePopWithPush { + template + __host__ __device__ __forceinline__ KeyValueOp operator()( + KeyValueOp const& lhs, KeyValueOp const& rhs) const + { + // If RHS is a read, then we need to figure out whether we can propagate the value from the LHS + bool is_rhs_read = symbol_to_stack_op_type(rhs.value) != stack_op_type::PUSH; + + // Whether LHS is a matching write (i.e., the push operation that is on top of the stack for the + // RHS's read) + bool is_lhs_matching_write = + (lhs.key == rhs.key) && symbol_to_stack_op_type(lhs.value) == stack_op_type::PUSH; + + return (is_rhs_read && is_lhs_matching_write) ? lhs : rhs; + } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; +}; + +/** + * @brief Binary reduction operator that is used to replace each read_symbol occurance with the last + * non-read_symbol that precedes such read_symbol. + */ +template +struct PropagateLastWrite { + __host__ __device__ __forceinline__ StackSymbolT operator()(StackSymbolT const& lhs, + StackSymbolT const& rhs) const + { + // If RHS is a yet-to-be-propagated, then we need to check whether we can use the LHS to fill + bool is_rhs_read = (rhs == read_symbol); + + // We propagate the write from the LHS if it's a write + bool is_lhs_write = (lhs != read_symbol); + + return (is_rhs_read && is_lhs_write) ? lhs : rhs; + } + + /// The read_symbol that is supposed to be replaced + StackSymbolT read_symbol; +}; + +/** + * @brief Helper function object class to convert a KeyValueOp to the stack symbol of that + * KeyValueOp. 
+ */ +struct KVOpToStackSymbol { + template + __host__ __device__ __forceinline__ ValueT operator()(KeyValueOp const& kv_op) const + { + return kv_op.value; + } +}; + +/** + * @brief Replaces all operations that apply to stack level '0' with the empty stack symbol + */ +template +struct RemapEmptyStack { + __host__ __device__ __forceinline__ KeyValueOpT operator()(KeyValueOpT const& kv_op) const + { + return kv_op.key == 0 ? empty_stack_symbol : kv_op; + } + KeyValueOpT empty_stack_symbol; +}; + +} // namespace detail + +/** + * @brief Takes a sparse representation of a sequence of stack operations that either push something + * onto the stack or pop something from the stack and resolves the symbol that is on top of the + * stack. + * + * @tparam StackLevelT Signed integer type that must be sufficient to cover [-max_stack_level, + * max_stack_level] for the given sequence of stack operations. Must be signed as it needs to cover + * the stack level of any arbitrary subsequence of stack operations. + * @tparam StackSymbolItT An input iterator type that provides the sequence of symbols that + * represent stack operations + * @tparam SymbolPositionT The index that this stack operation is supposed to apply to + * @tparam StackSymbolToStackOpT Function object class to transform items from StackSymbolItT to + * stack_op_type + * @tparam TopOfStackOutItT Output iterator type to which StackSymbolT are being assigned + * @tparam StackSymbolT The internal type being used (usually corresponding to StackSymbolItT's + * value_type) + * @tparam OffsetT Signed or unsigned integer type large enough to index into both the sparse input + * sequence and the top-of-stack output sequence + * @param[in] d_symbols Sequence of symbols that represent stack operations. Memory may alias with + * \p d_top_of_stack + * @param[in,out] d_symbol_positions Sequence of symbol positions (for a sparse representation), + * sequence must be ordered in ascending order. Note, the memory of this array is repurposed for + * double-buffering. 
+ * @param[in] symbol_to_stack_op Function object that returns a stack operation type (push, pop, or + * read) for a given symbol from \p d_symbols + * @param[out] d_top_of_stack A random access output iterator that will be populated with + * what-is-on-top-of-the-stack for the given sequence of stack operations \p d_symbols + * @param[in] empty_stack_symbol The symbol that will be written to top_of_stack whenever the stack + * was empty + * @param[in] read_symbol A symbol that may not be confused for a symbol that would push to the + * stack + * @param[in] num_symbols_in The number of symbols in the sparse representation + * @param[in] num_symbols_out The number of symbols that are supposed to be filled with + * what-is-on-top-of-the-stack + * @param[in] stream The cuda stream to which to dispatch the work + */ +template +void SparseStackOpToTopOfStack(void* d_temp_storage, + size_t& temp_storage_bytes, + StackSymbolItT d_symbols, + SymbolPositionT* d_symbol_positions, + StackSymbolToStackOpT symbol_to_stack_op, + TopOfStackOutItT d_top_of_stack, + StackSymbolT empty_stack_symbol, + StackSymbolT read_symbol, + OffsetT num_symbols_in, + OffsetT num_symbols_out, + cudaStream_t stream = nullptr) +{ + // Type used to hold key-value pairs (key being the stack level and the value being the stack + // symbol) + using KeyValueOpT = detail::KeyValueOp; + + // The unsigned integer type that we use for radix sorting items of type KeyValueOpT + using KVOpUnsignedT = detail::UnsignedKeyValueOpType; + + // Transforming sequence of stack symbols to key-value store operations, where the key corresponds + // to the stack level of a given stack operation and the value corresponds to the stack symbol of + // that operation + using StackSymbolToKVOpT = detail::StackSymbolToKVOp; + + // TransformInputIterator converting stack symbols to key-value store operations + using TransformInputItT = + cub::TransformInputIterator; + + // Converting a stack symbol that may either push or pop to a key-value store operation: + // stack_symbol -> ([+1,0,-1], stack_symbol) + StackSymbolToKVOpT stack_sym_to_kv_op{symbol_to_stack_op}; + TransformInputItT stack_symbols_in(d_symbols, stack_sym_to_kv_op); + + // Double-buffer for sorting along the given sequence of symbol positions (the sparse + // representation) + cub::DoubleBuffer d_symbol_positions_db{nullptr, nullptr}; + + // Double-buffer for sorting the key-value store operations + cub::DoubleBuffer d_kv_operations{nullptr, nullptr}; + + // A double-buffer that aliases memory from d_kv_operations but offset by one item (to discard the + // exclusive scans first item) + cub::DoubleBuffer d_kv_operations_offset{nullptr, nullptr}; + + // A double-buffer that aliases memory from d_kv_operations_offset with unsigned types in order to + // be able to perform a radix sort + cub::DoubleBuffer d_kv_operations_unsigned{nullptr, nullptr}; + + constexpr std::size_t bits_per_byte = 8; + constexpr std::size_t begin_bit = offsetof(KeyValueOpT, key) * bits_per_byte; + constexpr std::size_t end_bit = begin_bit + (sizeof(KeyValueOpT::key) * bits_per_byte); + + // The key-value store operation that makes sure that reads for stack level '0' will be populated + // with the empty_stack_symbol + KeyValueOpT const empty_stack{0, empty_stack_symbol}; + + cub::TransformInputIterator, KeyValueOpT*> + kv_ops_scan_in(nullptr, detail::RemapEmptyStack{empty_stack}); + KeyValueOpT* kv_ops_scan_out = nullptr; + + //------------------------------------------------------------------------------ + // MEMORY 
REQUIREMENTS + //------------------------------------------------------------------------------ + enum mem_alloc_id { + temp_storage = 0, + symbol_position_alt, + kv_ops_current, + kv_ops_alt, + num_allocations + }; + + void* allocations[mem_alloc_id::num_allocations] = {nullptr}; + std::size_t allocation_sizes[mem_alloc_id::num_allocations] = {0}; + + std::size_t stack_level_scan_bytes = 0; + std::size_t stack_level_sort_bytes = 0; + std::size_t match_level_scan_bytes = 0; + std::size_t propagate_writes_scan_bytes = 0; + + // Getting temporary storage requirements for the prefix sum of the stack level after each + // operation + CUDA_TRY(cub::DeviceScan::InclusiveScan(nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations_offset.Current(), + detail::AddStackLevelFromKVOp{}, + num_symbols_in, + stream)); + + // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the + // operations) + CUDA_TRY(cub::DeviceRadixSort::SortPairs(nullptr, + stack_level_sort_bytes, + d_kv_operations_unsigned, + d_symbol_positions_db, + num_symbols_in, + begin_bit, + end_bit, + stream)); + + // Getting temporary storage requirements for the scan to match pop operations with the latest + // push of the same level + CUDA_TRY(cub::DeviceScan::InclusiveScan( + nullptr, + match_level_scan_bytes, + kv_ops_scan_in, + kv_ops_scan_out, + detail::PopulatePopWithPush{symbol_to_stack_op}, + num_symbols_in, + stream)); + + // Getting temporary storage requirements for the scan to propagate top-of-stack for spots that + // didn't push or pop + CUDA_TRY(cub::DeviceScan::ExclusiveScan(nullptr, + propagate_writes_scan_bytes, + d_top_of_stack, + d_top_of_stack, + detail::PropagateLastWrite{read_symbol}, + empty_stack_symbol, + num_symbols_out, + stream)); + + // Scratch memory required by the algorithms + allocation_sizes[mem_alloc_id::temp_storage] = std::max({stack_level_scan_bytes, + stack_level_sort_bytes, + match_level_scan_bytes, + propagate_writes_scan_bytes}); + + // Memory requirements by auxiliary buffers + constexpr std::size_t extra_overlap_bytes = 2U; + allocation_sizes[mem_alloc_id::symbol_position_alt] = num_symbols_in * sizeof(SymbolPositionT); + allocation_sizes[mem_alloc_id::kv_ops_current] = + (num_symbols_in + extra_overlap_bytes) * sizeof(KeyValueOpT); + allocation_sizes[mem_alloc_id::kv_ops_alt] = + (num_symbols_in + extra_overlap_bytes) * sizeof(KeyValueOpT); + + // Try to alias into the user-provided temporary storage memory blob + CUDA_TRY(cub::AliasTemporaries( + d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + + // If this call was just to retrieve auxiliary memory requirements or not sufficient memory was + // provided + if (!d_temp_storage) { return; } + + //------------------------------------------------------------------------------ + // ALGORITHM + //------------------------------------------------------------------------------ + // Amount of temp storage available to CUB algorithms + std::size_t cub_temp_storage_bytes = allocation_sizes[mem_alloc_id::temp_storage]; + + // Temp storage for CUB algorithms + void* d_cub_temp_storage = allocations[mem_alloc_id::temp_storage]; + + // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations + d_symbol_positions_db = cub::DoubleBuffer{ + d_symbol_positions, + reinterpret_cast(allocations[mem_alloc_id::symbol_position_alt])}; + + // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations + d_kv_operations = 
cub::DoubleBuffer{ + reinterpret_cast(allocations[mem_alloc_id::kv_ops_current]), + reinterpret_cast(allocations[mem_alloc_id::kv_ops_alt])}; + + d_kv_operations_offset = + cub::DoubleBuffer{d_kv_operations.Current(), d_kv_operations.Alternate()}; + + // Compute prefix sum of the stack level after each operation + CUDA_TRY(cub::DeviceScan::InclusiveScan(d_cub_temp_storage, + cub_temp_storage_bytes, + stack_symbols_in, + d_kv_operations_offset.Current(), + detail::AddStackLevelFromKVOp{}, + num_symbols_in, + stream)); + + // Stable radix sort, sorting by stack level of the operations + d_kv_operations_unsigned = cub::DoubleBuffer{ + reinterpret_cast(d_kv_operations_offset.Current()), + reinterpret_cast(d_kv_operations_offset.Alternate())}; + CUDA_TRY(cub::DeviceRadixSort::SortPairs(d_cub_temp_storage, + cub_temp_storage_bytes, + d_kv_operations_unsigned, + d_symbol_positions_db, + num_symbols_in, + begin_bit, + end_bit, + stream)); + + // TransformInputIterator that remaps all operations on stack level 0 to the empty stack symbol + kv_ops_scan_in = {reinterpret_cast(d_kv_operations_unsigned.Current()), + detail::RemapEmptyStack{empty_stack}}; + kv_ops_scan_out = reinterpret_cast(d_kv_operations_unsigned.Alternate()); + + // Exclusive scan to match pop operations with the latest push operation of that level + CUDA_TRY(cub::DeviceScan::InclusiveScan( + d_cub_temp_storage, + cub_temp_storage_bytes, + kv_ops_scan_in, + kv_ops_scan_out, + detail::PopulatePopWithPush{symbol_to_stack_op}, + num_symbols_in, + stream)); + + // Fill the output tape with read-symbol + thrust::fill(thrust::cuda::par.on(stream), + thrust::device_ptr{d_top_of_stack}, + thrust::device_ptr{d_top_of_stack + num_symbols_out}, + read_symbol); + + // Transform the key-value operations to the stack symbol they represent + cub::TransformInputIterator + kv_op_to_stack_sym_it(kv_ops_scan_out, detail::KVOpToStackSymbol{}); + + // Scatter the stack symbols to the output tape (spots that are not scattered to have been + // pre-filled with the read-symbol) + thrust::scatter(thrust::cuda::par.on(stream), + kv_op_to_stack_sym_it, + kv_op_to_stack_sym_it + num_symbols_in, + d_symbol_positions_db.Current(), + d_top_of_stack); + + // We perform an exclusive scan in order to fill the items at the very left that may + // be reading the empty stack before there's the first push occurance in the sequence. + // Also, we're interested in the top-of-the-stack symbol before the operation was applied. 
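+  // Worked example (illustration only, using the unit test's symbol conventions): for the dense
+  // input "a{b[c]d}e" with '{'/'[' as PUSH and '}'/']' as POP, the sparse representation is
+  // d_symbols = "{[]}" with d_symbol_positions = {1, 3, 5, 7}. After the fill and scatter above,
+  // the output tape reads "x{x[x{x_x"; the exclusive scan below then yields "__{{[[{{_", i.e.,
+  // the top-of-stack symbol just before each input character ('_' = empty_stack_symbol,
+  // 'x' = read_symbol).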
+ CUDA_TRY(cub::DeviceScan::ExclusiveScan(d_cub_temp_storage, + cub_temp_storage_bytes, + d_top_of_stack, + d_top_of_stack, + detail::PropagateLastWrite{read_symbol}, + empty_stack_symbol, + num_symbols_out, + stream)); +} + +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eadcd985de3..1505c5cdd1b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -224,6 +224,7 @@ ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) +ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu new file mode 100644 index 00000000000..d2144226457 --- /dev/null +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace { +namespace fst = cudf::io::fst; + +/** + * @brief Generates the sparse representation of stack operations to feed into the logical + * stack + * + * @param begin Forward input iterator to the first item of symbols that are checked for whether + * they push or pop + * @param end Forward input iterator to one one past the last item of symbols that are checked for + * whether they push or pop + * @param to_stack_op A function object that takes an instance of InputItT's value type and + * returns the kind of stack operation such item represents (i.e., of type stack_op_type) + * @param stack_symbol_out Forward output iterator to which symbols that either push or pop are + * assigned + * @param stack_op_index_out Forward output iterator to which the indexes of symbols that either + * push or pop are assigned + * @return Pair of iterators to one past the last item of the items written to \p stack_symbol_out + * and \p stack_op_index_out, respectively + */ +template +std::pair to_sparse_stack_symbols( + InputItT begin, + InputItT end, + ToStackOpTypeT to_stack_op, + StackSymbolOutItT stack_symbol_out, + StackOpIndexOutItT stack_op_index_out) +{ + std::size_t index = 0; + for (auto it = begin; it < end; it++) { + fst::stack_op_type op_type = to_stack_op(*it); + if (op_type == fst::stack_op_type::PUSH || op_type == fst::stack_op_type::POP) { + *stack_symbol_out = *it; + *stack_op_index_out = index; + stack_symbol_out++; + stack_op_index_out++; + } + index++; + } + return std::make_pair(stack_symbol_out, stack_op_index_out); +} + +/** + * @brief Reads in a sequence of items that represent stack operations, applies these operations to + * a stack, and, for every oepration being read in, outputs what was 
the symbol on top of the stack + * before the operations was applied. In case the stack is empty before any operation, + * \p empty_stack will be output instead. + * + * @tparam InputItT Forward input iterator type to items representing stack operations + * @tparam ToStackOpTypeT A transform function object class that maps an item representing a stack + * oepration to the stack_op_type of such item + * @tparam StackSymbolT Type representing items being pushed onto the stack + * @tparam TopOfStackOutItT A forward output iterator type being assigned items of StackSymbolT + * @param[in] begin Forward iterator to the beginning of the items representing stack operations + * @param[in] end Iterator to one past the last item representing the stack operation + * @param[in] to_stack_op A function object that takes an instance of InputItT's value type and + * returns the kind of stack operation such item represents (i.e., of type stack_op_type) + * @param[in] empty_stack A symbol that will be written to top_of_stack whenever the stack was empty + * @param[out] top_of_stack The output iterator to which the item will be written to + * @return TopOfStackOutItT Iterators to one past the last element that was written + */ +template +TopOfStackOutItT to_top_of_stack(InputItT begin, + InputItT end, + ToStackOpTypeT to_stack_op, + StackSymbolT empty_stack, + TopOfStackOutItT top_of_stack) +{ + std::stack stack; + for (auto it = begin; it < end; it++) { + // Write what is currently on top of the stack when reading in the current symbol + *top_of_stack = stack.empty() ? empty_stack : stack.top(); + top_of_stack++; + + auto const& current = *it; + fst::stack_op_type op_type = to_stack_op(current); + + // Check whether this symbol corresponds to a push or pop operation and modify the stack + // accordingly + if (op_type == fst::stack_op_type::PUSH) { + stack.push(current); + } else if (op_type == fst::stack_op_type::POP) { + stack.pop(); + } + } + return top_of_stack; +} + +/** + * @brief Funciton object used to filter for brackets and braces that represent push and pop + * operations + * + */ +struct JSONToStackOp { + template + __host__ __device__ __forceinline__ fst::stack_op_type operator()( + StackSymbolT const& stack_symbol) const + { + return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH + : (stack_symbol == '}' || stack_symbol == ']') ? 
fst::stack_op_type::POP + : fst::stack_op_type::READ; + } +}; +} // namespace + + +// Base test fixture for tests +struct LogicalStackTest : public cudf::test::BaseFixture { +}; + +TEST_F(LogicalStackTest, GroundTruth) +{ + // Type sufficient to cover any stack level (must be a signed type) + using StackLevelT = int8_t; + using SymbolT = char; + using SymbolOffsetT = uint32_t; + + // The stack symbol that we'll fill everywhere where there's nothing on the stack + constexpr SymbolT empty_stack_symbol = '_'; + + // This just has to be a stack symbol that may not be confused with a symbol that would push or + // pop + constexpr SymbolT read_symbol = 'x'; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input, + std::string input = R"( { +"category": "reference", +"index:" [4,12,42], +"author": "Nigel Rees", +"title": "Sayings of the Century", +"price": 8.95 +} +{ +"category": "reference", +"index:" [4,{},null,{"a":[]}], +"author": "Nigel Rees", +"title": "Sayings of the Century", +"price": 8.95 +} )"; + + // Repeat input sample 1024x + for (std::size_t i = 0; i < 10; i++) + input += input; + + // Getting the symbols that actually modify the stack (i.e., symbols that push or pop) + std::string stack_symbols = ""; + std::vector stack_op_indexes; + stack_op_indexes.reserve(input.size()); + + // Get the sparse representation of stack operations + to_sparse_stack_symbols(std::cbegin(input), + std::cend(input), + JSONToStackOp{}, + std::back_inserter(stack_symbols), + std::back_inserter(stack_op_indexes)); + + // Prepare sparse stack ops + std::size_t num_stack_ops = stack_symbols.size(); + + rmm::device_uvector d_stack_ops(stack_symbols.size(), stream_view); + rmm::device_uvector d_stack_op_indexes(stack_op_indexes.size(), stream_view); + auto top_of_stack_gpu = hostdevice_vector(input.size(), stream_view); + + cudaMemcpyAsync(d_stack_ops.data(), + stack_symbols.data(), + stack_symbols.size() * sizeof(SymbolT), + cudaMemcpyHostToDevice, + stream); + + cudaMemcpyAsync(d_stack_op_indexes.data(), + stack_op_indexes.data(), + stack_op_indexes.size() * sizeof(SymbolOffsetT), + cudaMemcpyHostToDevice, + stream); + + // Prepare output + std::size_t string_size = input.size(); + SymbolT* d_top_of_stack = nullptr; + cudaMalloc(&d_top_of_stack, string_size + 1); + + // Request temporary storage requirements + std::size_t temp_storage_bytes = 0; + fst::SparseStackOpToTopOfStack(nullptr, + temp_storage_bytes, + d_stack_ops.data(), + d_stack_op_indexes.data(), + JSONToStackOp{}, + d_top_of_stack, + empty_stack_symbol, + read_symbol, + num_stack_ops, + string_size, + stream); + + // Allocate temporary storage required by the get-top-of-the-stack algorithm + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream_view); + + // Run algorithm + fst::SparseStackOpToTopOfStack(d_temp_storage.data(), + temp_storage_bytes, + d_stack_ops.data(), + d_stack_op_indexes.data(), + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + num_stack_ops, + string_size, + stream); + + // Async copy results from device to host + top_of_stack_gpu.device_to_host(stream_view); + + // Get CPU-side results for verification + std::string top_of_stack_cpu{}; + top_of_stack_cpu.reserve(input.size()); + to_top_of_stack(std::cbegin(input), + std::cend(input), + JSONToStackOp{}, + empty_stack_symbol, + std::back_inserter(top_of_stack_cpu)); + + // Make sure results have been 
copied back to host + cudaStreamSynchronize(stream); + + // Verify results + ASSERT_EQ(input.size(), top_of_stack_cpu.size()); + for (size_t i = 0; i < input.size() && i < top_of_stack_cpu.size(); i++) { + ASSERT_EQ(top_of_stack_gpu.host_ptr()[i], top_of_stack_cpu[i]) << "Mismatch at index #" << i; + } +} + +CUDF_TEST_PROGRAM_MAIN() From 377358a41be1fa869ca9835ae00a8bf1d81734c3 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 29 Mar 2022 07:31:44 -0700 Subject: [PATCH 002/173] style fix & additional test scenario --- cpp/tests/io/fst/logical_stack_test.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index d2144226457..389ac73e533 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -140,7 +140,6 @@ struct JSONToStackOp { }; } // namespace - // Base test fixture for tests struct LogicalStackTest : public cudf::test::BaseFixture { }; @@ -178,9 +177,9 @@ TEST_F(LogicalStackTest, GroundTruth) "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 -} )"; +} {} [] [ ])"; - // Repeat input sample 1024x + // Repeat input sample 1024x for (std::size_t i = 0; i < 10; i++) input += input; From 418600413fd0d8b001c11d4450613eea3190a552 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:25:20 -0700 Subject: [PATCH 003/173] removed forceinline --- cpp/src/io/fst/logical_stack.cuh | 18 +++++++++--------- cpp/tests/io/fst/logical_stack_test.cu | 3 +-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 7069ee3b404..b68d53d742d 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -107,7 +107,7 @@ using UnsignedKeyValueOpType = typename KeyValueOpToUnsigned struct StackSymbolToKVOp { template - __host__ __device__ __forceinline__ KeyValueOpT operator()(StackSymbolT const& stack_symbol) const + __host__ __device__ KeyValueOpT operator()(StackSymbolT const& stack_symbol) const { stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); // PUSH => +1, POP => -1, READ => 0 @@ -127,8 +127,8 @@ struct StackSymbolToKVOp { */ struct AddStackLevelFromKVOp { template - __host__ __device__ __forceinline__ KeyValueOp operator()( - KeyValueOp const& lhs, KeyValueOp const& rhs) const + __host__ __device__ KeyValueOp operator()(KeyValueOp const& lhs, + KeyValueOp const& rhs) const { KeyT new_level = lhs.key + rhs.key; return KeyValueOp{new_level, rhs.value}; @@ -144,8 +144,8 @@ struct AddStackLevelFromKVOp { template struct PopulatePopWithPush { template - __host__ __device__ __forceinline__ KeyValueOp operator()( - KeyValueOp const& lhs, KeyValueOp const& rhs) const + __host__ __device__ KeyValueOp operator()(KeyValueOp const& lhs, + KeyValueOp const& rhs) const { // If RHS is a read, then we need to figure out whether we can propagate the value from the LHS bool is_rhs_read = symbol_to_stack_op_type(rhs.value) != stack_op_type::PUSH; @@ -168,8 +168,8 @@ struct PopulatePopWithPush { */ template struct PropagateLastWrite { - __host__ __device__ __forceinline__ StackSymbolT operator()(StackSymbolT const& lhs, - StackSymbolT const& rhs) const + __host__ __device__ StackSymbolT operator()(StackSymbolT const& lhs, + StackSymbolT const& rhs) const { // If RHS is a yet-to-be-propagated, then we need to check whether we can 
use the LHS to fill bool is_rhs_read = (rhs == read_symbol); @@ -190,7 +190,7 @@ struct PropagateLastWrite { */ struct KVOpToStackSymbol { template - __host__ __device__ __forceinline__ ValueT operator()(KeyValueOp const& kv_op) const + __host__ __device__ ValueT operator()(KeyValueOp const& kv_op) const { return kv_op.value; } @@ -201,7 +201,7 @@ struct KVOpToStackSymbol { */ template struct RemapEmptyStack { - __host__ __device__ __forceinline__ KeyValueOpT operator()(KeyValueOpT const& kv_op) const + __host__ __device__ KeyValueOpT operator()(KeyValueOpT const& kv_op) const { return kv_op.key == 0 ? empty_stack_symbol : kv_op; } diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 389ac73e533..9eb1a90c5fe 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -130,8 +130,7 @@ TopOfStackOutItT to_top_of_stack(InputItT begin, */ struct JSONToStackOp { template - __host__ __device__ __forceinline__ fst::stack_op_type operator()( - StackSymbolT const& stack_symbol) const + __host__ __device__ fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const { return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH : (stack_symbol == '}' || stack_symbol == ']') ? fst::stack_op_type::POP From a921c6671c94c2a019aec35f6f4858cd79188a5a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 30 Mar 2022 03:23:48 -0700 Subject: [PATCH 004/173] tagging host device function --- cpp/src/io/fst/logical_stack.cuh | 13 +++++++------ cpp/tests/io/fst/logical_stack_test.cu | 3 ++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index b68d53d742d..f3595504245 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -24,6 +24,7 @@ #include #include +#include namespace cudf { namespace io { @@ -107,7 +108,7 @@ using UnsignedKeyValueOpType = typename KeyValueOpToUnsigned struct StackSymbolToKVOp { template - __host__ __device__ KeyValueOpT operator()(StackSymbolT const& stack_symbol) const + constexpr CUDF_HOST_DEVICE KeyValueOpT operator()(StackSymbolT const& stack_symbol) const { stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); // PUSH => +1, POP => -1, READ => 0 @@ -127,7 +128,7 @@ struct StackSymbolToKVOp { */ struct AddStackLevelFromKVOp { template - __host__ __device__ KeyValueOp operator()(KeyValueOp const& lhs, + constexpr CUDF_HOST_DEVICE KeyValueOp operator()(KeyValueOp const& lhs, KeyValueOp const& rhs) const { KeyT new_level = lhs.key + rhs.key; @@ -144,7 +145,7 @@ struct AddStackLevelFromKVOp { template struct PopulatePopWithPush { template - __host__ __device__ KeyValueOp operator()(KeyValueOp const& lhs, + constexpr CUDF_HOST_DEVICE KeyValueOp operator()(KeyValueOp const& lhs, KeyValueOp const& rhs) const { // If RHS is a read, then we need to figure out whether we can propagate the value from the LHS @@ -168,7 +169,7 @@ struct PopulatePopWithPush { */ template struct PropagateLastWrite { - __host__ __device__ StackSymbolT operator()(StackSymbolT const& lhs, + constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StackSymbolT const& lhs, StackSymbolT const& rhs) const { // If RHS is a yet-to-be-propagated, then we need to check whether we can use the LHS to fill @@ -190,7 +191,7 @@ struct PropagateLastWrite { */ struct KVOpToStackSymbol { template - __host__ __device__ ValueT operator()(KeyValueOp const& kv_op) 
const + constexpr CUDF_HOST_DEVICE ValueT operator()(KeyValueOp const& kv_op) const { return kv_op.value; } @@ -201,7 +202,7 @@ struct KVOpToStackSymbol { */ template struct RemapEmptyStack { - __host__ __device__ KeyValueOpT operator()(KeyValueOpT const& kv_op) const + constexpr CUDF_HOST_DEVICE KeyValueOpT operator()(KeyValueOpT const& kv_op) const { return kv_op.key == 0 ? empty_stack_symbol : kv_op; } diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 9eb1a90c5fe..7d4564d3204 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -130,7 +131,7 @@ TopOfStackOutItT to_top_of_stack(InputItT begin, */ struct JSONToStackOp { template - __host__ __device__ fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const { return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH : (stack_symbol == '}' || stack_symbol == ']') ? fst::stack_op_type::POP From 75a185397b08e0be2b38d8e56a4467cea207de9f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 31 Mar 2022 04:11:44 -0700 Subject: [PATCH 005/173] Added utility to debug print & instrumented code to use it --- cpp/include/cudf_test/print_utilities.cuh | 129 ++++++++++++++++++++++ cpp/src/io/fst/logical_stack.cuh | 103 +++++++++++++---- 2 files changed, 213 insertions(+), 19 deletions(-) create mode 100644 cpp/include/cudf_test/print_utilities.cuh diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh new file mode 100644 index 00000000000..04a8d8c9bea --- /dev/null +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cub/util_type.cuh" +#include +#include +#include + +namespace cudf { +namespace test { +namespace print { + +constexpr int32_t hex_tag = 0; + +template +struct TaggedType { + T v; +}; + +template +using hex_t = TaggedType; + +template +struct ToTaggedType { + template + CUDF_HOST_DEVICE WrappedTypeT operator()(T const& v) const + { + return WrappedTypeT{v}; + } +}; + +template +auto hex(InItT it) +{ + using value_t = typename std::iterator_traits::value_type; + using tagged_t = hex_t; + return cub::TransformInputIterator, InItT>( + it, ToTaggedType{}); +} + +template && std::is_signed_v>* = nullptr> +CUDF_HOST_DEVICE void print_value(int32_t width, T arg) +{ + printf("%*d", width, arg); +} + +template && std::is_unsigned_v>* = nullptr> +CUDF_HOST_DEVICE void print_value(int32_t width, T arg) +{ + printf("%*d", width, arg); +} + +CUDF_HOST_DEVICE void print_value(int32_t width, char arg) { printf("%*c", width, arg); } + +template +CUDF_HOST_DEVICE void print_value(int32_t width, hex_t arg) +{ + printf("%*X", width, arg.v); +} + +namespace detail +{ +template +CUDF_HOST_DEVICE void print_line(int32_t width, char delimiter, T arg) +{ + print_value(width, arg); +} + +template +CUDF_HOST_DEVICE void print_line(int32_t width, char delimiter, T arg, Ts... args) +{ + print_value(width, arg); + if (delimiter) printf("%c", delimiter); + print_line(width, delimiter, args...); +} + +template +__global__ void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... args) +{ + if (threadIdx.x == 0 && blockIdx.x == 0) { + for (std::size_t i = 0; i < count; i++) { + printf("%6lu: ", i); + print_line(width, delimiter, args[i]...); + printf("\n"); + } + } +} +} + +/** + * @brief Prints \p count elements from each of the given device-accessible iterators. + * + * @param count The number of items to print from each device-accessible iterator + * @param stream The cuda stream to which the printing kernel shall be dispatched + * @param args List of iterators to be printed + */ +template +void print_array(std::size_t count, cudaStream_t stream, Ts... 
args) +{ + // The width to pad printed numbers to + constexpr int32_t width = 6; + + // Delimiter used for separating values from subsequent iterators + constexpr char delimiter = ','; + + // TODO we want this to compile to nothing dependnig on compiler flag, rather than runtime + if (std::getenv("CUDA_DBG_DUMP") != nullptr) { + detail::print_array_kernel<<<1, 1, 0, stream>>>(count, width, delimiter, args...); + } +} + +} // namespace print +} // namespace test +} // namespace cudf diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index f3595504245..b725f8fed3f 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -23,8 +23,9 @@ #include #include -#include #include +#include +#include namespace cudf { namespace io { @@ -128,8 +129,8 @@ struct StackSymbolToKVOp { */ struct AddStackLevelFromKVOp { template - constexpr CUDF_HOST_DEVICE KeyValueOp operator()(KeyValueOp const& lhs, - KeyValueOp const& rhs) const + constexpr CUDF_HOST_DEVICE KeyValueOp operator()( + KeyValueOp const& lhs, KeyValueOp const& rhs) const { KeyT new_level = lhs.key + rhs.key; return KeyValueOp{new_level, rhs.value}; @@ -145,8 +146,8 @@ struct AddStackLevelFromKVOp { template struct PopulatePopWithPush { template - constexpr CUDF_HOST_DEVICE KeyValueOp operator()(KeyValueOp const& lhs, - KeyValueOp const& rhs) const + constexpr CUDF_HOST_DEVICE KeyValueOp operator()( + KeyValueOp const& lhs, KeyValueOp const& rhs) const { // If RHS is a read, then we need to figure out whether we can propagate the value from the LHS bool is_rhs_read = symbol_to_stack_op_type(rhs.value) != stack_op_type::PUSH; @@ -170,7 +171,7 @@ struct PopulatePopWithPush { template struct PropagateLastWrite { constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StackSymbolT const& lhs, - StackSymbolT const& rhs) const + StackSymbolT const& rhs) const { // If RHS is a yet-to-be-propagated, then we need to check whether we can use the LHS to fill bool is_rhs_read = (rhs == read_symbol); @@ -209,6 +210,46 @@ struct RemapEmptyStack { KeyValueOpT empty_stack_symbol; }; +/** + * @brief Function object to return only the key part from a KeyValueOp instance. + */ +struct KVOpToKey { + template + constexpr CUDF_HOST_DEVICE KeyT operator()(KeyValueOp const& kv_op) const + { + return kv_op.key; + } +}; + +/** + * @brief Function object to return only the value part from a KeyValueOp instance. + */ +struct KVOpToValue { + template + constexpr CUDF_HOST_DEVICE ValueT operator()(KeyValueOp const& kv_op) const + { + return kv_op.value; + } +}; + +/** + * @brief Retrieves an iterator that returns only the `key` part from a KeyValueOp iterator. + */ +template +auto get_key_it(KeyValueOpItT it) +{ + return thrust::make_transform_iterator(it, KVOpToKey{}); +} + +/** + * @brief Retrieves an iterator that returns only the `value` part from a KeyValueOp iterator. 
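+ *
+ * E.g., get_value_it(d_kv_operations.Current()) iterates over just the stack symbols; the debug
+ * dumps in this file use it together with get_key_it to print (stack level, symbol) pairs.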
+ */ +template +auto get_value_it(KeyValueOpItT it) +{ + return thrust::make_transform_iterator(it, KVOpToValue{}); +} + } // namespace detail /** @@ -294,11 +335,7 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, // Double-buffer for sorting the key-value store operations cub::DoubleBuffer d_kv_operations{nullptr, nullptr}; - // A double-buffer that aliases memory from d_kv_operations but offset by one item (to discard the - // exclusive scans first item) - cub::DoubleBuffer d_kv_operations_offset{nullptr, nullptr}; - - // A double-buffer that aliases memory from d_kv_operations_offset with unsigned types in order to + // A double-buffer that aliases memory from d_kv_operations with unsigned types in order to // be able to perform a radix sort cub::DoubleBuffer d_kv_operations_unsigned{nullptr, nullptr}; @@ -338,7 +375,7 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, CUDA_TRY(cub::DeviceScan::InclusiveScan(nullptr, stack_level_scan_bytes, stack_symbols_in, - d_kv_operations_offset.Current(), + d_kv_operations.Current(), detail::AddStackLevelFromKVOp{}, num_symbols_in, stream)); @@ -417,22 +454,27 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, reinterpret_cast(allocations[mem_alloc_id::kv_ops_current]), reinterpret_cast(allocations[mem_alloc_id::kv_ops_alt])}; - d_kv_operations_offset = - cub::DoubleBuffer{d_kv_operations.Current(), d_kv_operations.Alternate()}; - // Compute prefix sum of the stack level after each operation CUDA_TRY(cub::DeviceScan::InclusiveScan(d_cub_temp_storage, cub_temp_storage_bytes, stack_symbols_in, - d_kv_operations_offset.Current(), + d_kv_operations.Current(), detail::AddStackLevelFromKVOp{}, num_symbols_in, stream)); + // Dump info on stack operations: (stack level change + symbol) -> (absolute stack level + symbol) + test::print::print_array(num_symbols_in, + stream, + get_key_it(stack_symbols_in), + get_value_it(stack_symbols_in), + get_key_it(d_kv_operations.Current()), + get_value_it(d_kv_operations.Current())); + // Stable radix sort, sorting by stack level of the operations - d_kv_operations_unsigned = cub::DoubleBuffer{ - reinterpret_cast(d_kv_operations_offset.Current()), - reinterpret_cast(d_kv_operations_offset.Alternate())}; + d_kv_operations_unsigned = + cub::DoubleBuffer{reinterpret_cast(d_kv_operations.Current()), + reinterpret_cast(d_kv_operations.Alternate())}; CUDA_TRY(cub::DeviceRadixSort::SortPairs(d_cub_temp_storage, cub_temp_storage_bytes, d_kv_operations_unsigned, @@ -447,6 +489,11 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, detail::RemapEmptyStack{empty_stack}}; kv_ops_scan_out = reinterpret_cast(d_kv_operations_unsigned.Alternate()); + // Dump info on stack operations sorted by their stack level (i.e. stack level after applying + // operation) + test::print::print_array( + num_symbols_in, stream, get_key_it(kv_ops_scan_in), get_value_it(kv_ops_scan_in)); + // Exclusive scan to match pop operations with the latest push operation of that level CUDA_TRY(cub::DeviceScan::InclusiveScan( d_cub_temp_storage, @@ -457,6 +504,15 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, num_symbols_in, stream)); + // Dump info on stack operations sorted by their stack level (i.e. 
stack level after applying + // operation) + test::print::print_array(num_symbols_in, + stream, + get_key_it(kv_ops_scan_in), + get_value_it(kv_ops_scan_in), + get_key_it(kv_ops_scan_out), + get_value_it(kv_ops_scan_out)); + // Fill the output tape with read-symbol thrust::fill(thrust::cuda::par.on(stream), thrust::device_ptr{d_top_of_stack}, @@ -475,6 +531,11 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, d_symbol_positions_db.Current(), d_top_of_stack); + // Dump the output tape that has many yet-to-be-filled spots (i.e., all spots that were not given + // in the sparse representation) + test::print::print_array( + std::min(num_symbols_in, static_cast(10000)), stream, d_top_of_stack); + // We perform an exclusive scan in order to fill the items at the very left that may // be reading the empty stack before there's the first push occurance in the sequence. // Also, we're interested in the top-of-the-stack symbol before the operation was applied. @@ -486,6 +547,10 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, empty_stack_symbol, num_symbols_out, stream)); + + // Dump the final output + test::print::print_array( + std::min(num_symbols_in, static_cast(10000)), stream, d_top_of_stack); } } // namespace fst From a23668a66dd7e168f5b486d8e9af4c2c29f97fe7 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 31 Mar 2022 05:28:17 -0700 Subject: [PATCH 006/173] switched to using rmm also inside algorithm --- cpp/src/io/fst/logical_stack.cuh | 85 +++++++++----------------- cpp/tests/io/fst/logical_stack_test.cu | 22 ++----- 2 files changed, 33 insertions(+), 74 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index b725f8fed3f..9550798aeaf 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -27,6 +27,9 @@ #include #include +#include +#include + namespace cudf { namespace io { namespace fst { @@ -295,8 +298,7 @@ template -void SparseStackOpToTopOfStack(void* d_temp_storage, - size_t& temp_storage_bytes, +void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, StackSymbolItT d_symbols, SymbolPositionT* d_symbol_positions, StackSymbolToStackOpT symbol_to_stack_op, @@ -351,20 +353,6 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, kv_ops_scan_in(nullptr, detail::RemapEmptyStack{empty_stack}); KeyValueOpT* kv_ops_scan_out = nullptr; - //------------------------------------------------------------------------------ - // MEMORY REQUIREMENTS - //------------------------------------------------------------------------------ - enum mem_alloc_id { - temp_storage = 0, - symbol_position_alt, - kv_ops_current, - kv_ops_alt, - num_allocations - }; - - void* allocations[mem_alloc_id::num_allocations] = {nullptr}; - std::size_t allocation_sizes[mem_alloc_id::num_allocations] = {0}; - std::size_t stack_level_scan_bytes = 0; std::size_t stack_level_sort_bytes = 0; std::size_t match_level_scan_bytes = 0; @@ -414,49 +402,34 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, stream)); // Scratch memory required by the algorithms - allocation_sizes[mem_alloc_id::temp_storage] = std::max({stack_level_scan_bytes, - stack_level_sort_bytes, - match_level_scan_bytes, - propagate_writes_scan_bytes}); - - // Memory requirements by auxiliary buffers - constexpr std::size_t extra_overlap_bytes = 2U; - allocation_sizes[mem_alloc_id::symbol_position_alt] = num_symbols_in * sizeof(SymbolPositionT); - allocation_sizes[mem_alloc_id::kv_ops_current] = - (num_symbols_in + 
extra_overlap_bytes) * sizeof(KeyValueOpT); - allocation_sizes[mem_alloc_id::kv_ops_alt] = - (num_symbols_in + extra_overlap_bytes) * sizeof(KeyValueOpT); - - // Try to alias into the user-provided temporary storage memory blob - CUDA_TRY(cub::AliasTemporaries( - d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - - // If this call was just to retrieve auxiliary memory requirements or not sufficient memory was - // provided - if (!d_temp_storage) { return; } + auto total_temp_storage_bytes = std::max({stack_level_scan_bytes, + stack_level_sort_bytes, + match_level_scan_bytes, + propagate_writes_scan_bytes}); + + if (temp_storage.size() < total_temp_storage_bytes) { + temp_storage.resize(total_temp_storage_bytes, stream); + } + // Actual device buffer size, as we need to pass in an lvalue-ref to cub algorithms as temp_storage_bytes + total_temp_storage_bytes = temp_storage.size(); + + rmm::device_uvector d_symbol_position_alt{num_symbols_in, stream}; + rmm::device_uvector d_kv_ops_current{num_symbols_in, stream}; + rmm::device_uvector d_kv_ops_alt{num_symbols_in, stream}; //------------------------------------------------------------------------------ // ALGORITHM //------------------------------------------------------------------------------ - // Amount of temp storage available to CUB algorithms - std::size_t cub_temp_storage_bytes = allocation_sizes[mem_alloc_id::temp_storage]; - - // Temp storage for CUB algorithms - void* d_cub_temp_storage = allocations[mem_alloc_id::temp_storage]; - // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations - d_symbol_positions_db = cub::DoubleBuffer{ - d_symbol_positions, - reinterpret_cast(allocations[mem_alloc_id::symbol_position_alt])}; + d_symbol_positions_db = + cub::DoubleBuffer{d_symbol_positions, d_symbol_position_alt.data()}; // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations - d_kv_operations = cub::DoubleBuffer{ - reinterpret_cast(allocations[mem_alloc_id::kv_ops_current]), - reinterpret_cast(allocations[mem_alloc_id::kv_ops_alt])}; + d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation - CUDA_TRY(cub::DeviceScan::InclusiveScan(d_cub_temp_storage, - cub_temp_storage_bytes, + CUDA_TRY(cub::DeviceScan::InclusiveScan(temp_storage.data(), + total_temp_storage_bytes, stack_symbols_in, d_kv_operations.Current(), detail::AddStackLevelFromKVOp{}, @@ -475,8 +448,8 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, d_kv_operations_unsigned = cub::DoubleBuffer{reinterpret_cast(d_kv_operations.Current()), reinterpret_cast(d_kv_operations.Alternate())}; - CUDA_TRY(cub::DeviceRadixSort::SortPairs(d_cub_temp_storage, - cub_temp_storage_bytes, + CUDA_TRY(cub::DeviceRadixSort::SortPairs(temp_storage.data(), + total_temp_storage_bytes, d_kv_operations_unsigned, d_symbol_positions_db, num_symbols_in, @@ -496,8 +469,8 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, // Exclusive scan to match pop operations with the latest push operation of that level CUDA_TRY(cub::DeviceScan::InclusiveScan( - d_cub_temp_storage, - cub_temp_storage_bytes, + temp_storage.data(), + total_temp_storage_bytes, kv_ops_scan_in, kv_ops_scan_out, detail::PopulatePopWithPush{symbol_to_stack_op}, @@ -539,8 +512,8 @@ void SparseStackOpToTopOfStack(void* d_temp_storage, // We perform an exclusive scan in order to fill the items at the very left that may // be reading the empty stack 
before there's the first push occurance in the sequence. // Also, we're interested in the top-of-the-stack symbol before the operation was applied. - CUDA_TRY(cub::DeviceScan::ExclusiveScan(d_cub_temp_storage, - cub_temp_storage_bytes, + CUDA_TRY(cub::DeviceScan::ExclusiveScan(temp_storage.data(), + total_temp_storage_bytes, d_top_of_stack, d_top_of_stack, detail::PropagateLastWrite{read_symbol}, diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 7d4564d3204..3b860867cf2 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -217,28 +218,13 @@ TEST_F(LogicalStackTest, GroundTruth) // Prepare output std::size_t string_size = input.size(); SymbolT* d_top_of_stack = nullptr; - cudaMalloc(&d_top_of_stack, string_size + 1); - - // Request temporary storage requirements - std::size_t temp_storage_bytes = 0; - fst::SparseStackOpToTopOfStack(nullptr, - temp_storage_bytes, - d_stack_ops.data(), - d_stack_op_indexes.data(), - JSONToStackOp{}, - d_top_of_stack, - empty_stack_symbol, - read_symbol, - num_stack_ops, - string_size, - stream); + cudaMalloc(&d_top_of_stack, string_size); // Allocate temporary storage required by the get-top-of-the-stack algorithm - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream_view); + rmm::device_buffer d_temp_storage{}; // Run algorithm - fst::SparseStackOpToTopOfStack(d_temp_storage.data(), - temp_storage_bytes, + fst::SparseStackOpToTopOfStack(d_temp_storage, d_stack_ops.data(), d_stack_op_indexes.data(), JSONToStackOp{}, From aa5f5c43142ddb63e9e849a275872e331c90ac48 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 31 Mar 2022 09:34:41 -0700 Subject: [PATCH 007/173] header include order & SFINAE macro --- cpp/include/cudf_test/print_utilities.cuh | 18 ++++++++++-------- cpp/src/io/fst/logical_stack.cuh | 20 +++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh index 04a8d8c9bea..6e6fff84cec 100644 --- a/cpp/include/cudf_test/print_utilities.cuh +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -16,9 +16,12 @@ #pragma once +#include +#include + #include "cub/util_type.cuh" #include -#include + #include namespace cudf { @@ -53,13 +56,13 @@ auto hex(InItT it) it, ToTaggedType{}); } -template && std::is_signed_v>* = nullptr> +template && std::is_signed_v)> CUDF_HOST_DEVICE void print_value(int32_t width, T arg) { printf("%*d", width, arg); } -template && std::is_unsigned_v>* = nullptr> +template && std::is_unsigned_v)> CUDF_HOST_DEVICE void print_value(int32_t width, T arg) { printf("%*d", width, arg); @@ -73,8 +76,7 @@ CUDF_HOST_DEVICE void print_value(int32_t width, hex_t arg) printf("%*X", width, arg.v); } -namespace detail -{ +namespace detail { template CUDF_HOST_DEVICE void print_line(int32_t width, char delimiter, T arg) { @@ -100,11 +102,11 @@ __global__ void print_array_kernel(std::size_t count, int32_t width, char delimi } } } -} +} // namespace detail /** * @brief Prints \p count elements from each of the given device-accessible iterators. 
- * + * * @param count The number of items to print from each device-accessible iterator * @param stream The cuda stream to which the printing kernel shall be dispatched * @param args List of iterators to be printed @@ -113,7 +115,7 @@ template void print_array(std::size_t count, cudaStream_t stream, Ts... args) { // The width to pad printed numbers to - constexpr int32_t width = 6; + constexpr int32_t width = 6; // Delimiter used for separating values from subsequent iterators constexpr char delimiter = ','; diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 9550798aeaf..ced1a712d6a 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -15,14 +15,6 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include - #include #include #include @@ -30,6 +22,16 @@ #include #include +#include +#include +#include +#include + +#include + +#include +#include + namespace cudf { namespace io { namespace fst { @@ -307,7 +309,7 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, StackSymbolT read_symbol, OffsetT num_symbols_in, OffsetT num_symbols_out, - cudaStream_t stream = nullptr) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { // Type used to hold key-value pairs (key being the stack level and the value being the stack // symbol) From 4ee2253926943d025695f68ec22d8e4de5eab49c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sun, 3 Apr 2022 23:46:00 -0700 Subject: [PATCH 008/173] debug print cleanups --- cpp/include/cudf_test/print_utilities.cuh | 31 ++++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh index 6e6fff84cec..1da7b9836b1 100644 --- a/cpp/include/cudf_test/print_utilities.cuh +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -38,15 +38,28 @@ struct TaggedType { template using hex_t = TaggedType; -template +/** + * @brief Function object to transform a built-in type to a tagged type (e.g., in order to print + * values from an iterator returning uint32_t as hex values) + * + * @tparam TaggedTypeT A TaggedType template specialisation + */ +template struct ToTaggedType { template - CUDF_HOST_DEVICE WrappedTypeT operator()(T const& v) const + CUDF_HOST_DEVICE TaggedTypeT operator()(T const& v) const { - return WrappedTypeT{v}; + return TaggedTypeT{v}; } }; +/** + * @brief Returns an iterator that causes the values from \p it to be printed as hex values. + * + * @tparam InItT A random-access input iterator type + * @param it A random-access input iterator t + * @return + */ template auto hex(InItT it) { @@ -56,13 +69,13 @@ auto hex(InItT it) it, ToTaggedType{}); } -template && std::is_signed_v)> +template && std::is_signed_v)> CUDF_HOST_DEVICE void print_value(int32_t width, T arg) { printf("%*d", width, arg); } -template && std::is_unsigned_v)> +template && std::is_unsigned_v)> CUDF_HOST_DEVICE void print_value(int32_t width, T arg) { printf("%*d", width, arg); @@ -78,17 +91,17 @@ CUDF_HOST_DEVICE void print_value(int32_t width, hex_t arg) namespace detail { template -CUDF_HOST_DEVICE void print_line(int32_t width, char delimiter, T arg) +CUDF_HOST_DEVICE void print_values(int32_t width, char delimiter, T arg) { print_value(width, arg); } template -CUDF_HOST_DEVICE void print_line(int32_t width, char delimiter, T arg, Ts... args) +CUDF_HOST_DEVICE void print_values(int32_t width, char delimiter, T arg, Ts... 
args) { print_value(width, arg); if (delimiter) printf("%c", delimiter); - print_line(width, delimiter, args...); + print_values(width, delimiter, args...); } template @@ -97,7 +110,7 @@ __global__ void print_array_kernel(std::size_t count, int32_t width, char delimi if (threadIdx.x == 0 && blockIdx.x == 0) { for (std::size_t i = 0; i < count; i++) { printf("%6lu: ", i); - print_line(width, delimiter, args[i]...); + print_values(width, delimiter, args[i]...); printf("\n"); } } From 0f3585296c7a91a0eeee8aeceb02ad8c5a27be42 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 4 Apr 2022 02:28:30 -0700 Subject: [PATCH 009/173] renaming key-value store op to stack_op --- cpp/src/io/fst/logical_stack.cuh | 216 +++++++++++++++---------------- 1 file changed, 101 insertions(+), 115 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index ced1a712d6a..bce362beff9 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -48,10 +48,10 @@ enum class stack_op_type : int32_t { namespace detail { /** - * @brief A convenience struct that represents a stack opepration as a key-value pair, where the key + * @brief A convenience struct that represents a stack opepration as a pair, where the stack_level * represents the stack's level and the value represents the stack symbol. * - * @tparam KeyT The key type sufficient to cover all stack levels. Must be signed type as any + * @tparam StackLevelT The stack level type sufficient to cover all stack levels. Must be signed type as any * subsequence of stack operations must be able to be covered. E.g., consider the first 10 * operations are all push and the last 10 operations are all pop operations, we need to be able to * represent a partial aggregate of the first ten items, which is '+10', just as well as a partial @@ -59,69 +59,69 @@ namespace detail { * @tparam ValueT The value type that corresponds to the stack symbols (i.e., covers the stack * alphabet). */ -template -struct KeyValueOp { - KeyT key; +template +struct StackOp { + StackLevelT stack_level; ValueT value; }; /** - * @brief Helper class to assist with radix sorting KeyValueOp instances by key. + * @brief Helper class to assist with radix sorting StackOp instances by stack level. * - * @tparam BYTE_SIZE The size of the KeyValueOp. + * @tparam BYTE_SIZE The size of the StackOp. */ template -struct KeyValueOpToUnsigned { +struct StackOpToUnsigned { }; template <> -struct KeyValueOpToUnsigned<1U> { +struct StackOpToUnsigned<1U> { using UnsignedT = uint8_t; }; template <> -struct KeyValueOpToUnsigned<2U> { +struct StackOpToUnsigned<2U> { using UnsignedT = uint16_t; }; template <> -struct KeyValueOpToUnsigned<4U> { +struct StackOpToUnsigned<4U> { using UnsignedT = uint32_t; }; template <> -struct KeyValueOpToUnsigned<8U> { +struct StackOpToUnsigned<8U> { using UnsignedT = uint64_t; }; /** * @brief Alias template to retrieve an unsigned bit-representation that can be used for radix - * sorting the key of a KeyValueOp. + * sorting the stack level of a StackOp. 
* - * @tparam KeyValueOpT The KeyValueOp class template instance for which to get an unsigned + * @tparam StackOpT The StackOp class template instance for which to get an unsigned * bit-representation */ -template -using UnsignedKeyValueOpType = typename KeyValueOpToUnsigned::UnsignedT; +template +using UnsignedStackOpType = typename StackOpToUnsigned::UnsignedT; /** - * @brief Function object class template used for converting a stack operation to a key-value store - * operation, where the key corresponds to the stack level being accessed. + * @brief Function object class template used for converting a stack symbol to a stack + * operation that has a stack level to which an operation applies. * - * @tparam KeyValueOpT + * @tparam StackOpT * @tparam StackSymbolToStackOpTypeT */ -template -struct StackSymbolToKVOp { +template +struct StackSymbolToStackOp { template - constexpr CUDF_HOST_DEVICE KeyValueOpT operator()(StackSymbolT const& stack_symbol) const + constexpr CUDF_HOST_DEVICE StackOpT operator()(StackSymbolT const& stack_symbol) const { stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); // PUSH => +1, POP => -1, READ => 0 int32_t level_delta = stack_op == stack_op_type::PUSH ? 1 : stack_op == stack_op_type::POP ? -1 : 0; - return KeyValueOpT{static_cast(level_delta), stack_symbol}; + return StackOpT{static_cast(level_delta), stack_symbol}; } /// Function object returning a stack operation type for a given stack symbol @@ -132,27 +132,27 @@ struct StackSymbolToKVOp { * @brief Binary reduction operator to compute the absolute stack level from relative stack levels * (i.e., +1 for a PUSH, -1 for a POP operation). */ -struct AddStackLevelFromKVOp { - template - constexpr CUDF_HOST_DEVICE KeyValueOp operator()( - KeyValueOp const& lhs, KeyValueOp const& rhs) const +struct AddStackLevelFromStackOp { + template + constexpr CUDF_HOST_DEVICE StackOp operator()( + StackOp const& lhs, StackOp const& rhs) const { - KeyT new_level = lhs.key + rhs.key; - return KeyValueOp{new_level, rhs.value}; + StackLevelT new_level = lhs.stack_level + rhs.stack_level; + return StackOp{new_level, rhs.value}; } }; /** - * @brief Binary reduction operator that propagates a write operation for a specific key to all - * reads of that same key. That is, if the key of LHS compares equal to the key of the RHS and if + * @brief Binary reduction operator that propagates a write operation for a specific stack level to all + * reads of that same stack level. That is, if the stack level of LHS compares equal to the stack level of the RHS and if * the RHS is a read and the LHS is a write operation type, then we return LHS, otherwise we return * the RHS. */ template struct PopulatePopWithPush { - template - constexpr CUDF_HOST_DEVICE KeyValueOp operator()( - KeyValueOp const& lhs, KeyValueOp const& rhs) const + template + constexpr CUDF_HOST_DEVICE StackOp operator()( + StackOp const& lhs, StackOp const& rhs) const { // If RHS is a read, then we need to figure out whether we can propagate the value from the LHS bool is_rhs_read = symbol_to_stack_op_type(rhs.value) != stack_op_type::PUSH; @@ -160,7 +160,7 @@ struct PopulatePopWithPush { // Whether LHS is a matching write (i.e., the push operation that is on top of the stack for the // RHS's read) bool is_lhs_matching_write = - (lhs.key == rhs.key) && symbol_to_stack_op_type(lhs.value) == stack_op_type::PUSH; + (lhs.stack_level == rhs.stack_level) && symbol_to_stack_op_type(lhs.value) == stack_op_type::PUSH; return (is_rhs_read && is_lhs_matching_write) ? 
lhs : rhs; } @@ -192,12 +192,12 @@ struct PropagateLastWrite { }; /** - * @brief Helper function object class to convert a KeyValueOp to the stack symbol of that - * KeyValueOp. + * @brief Helper function object class to convert a StackOp to the stack symbol of that + * StackOp. */ -struct KVOpToStackSymbol { - template - constexpr CUDF_HOST_DEVICE ValueT operator()(KeyValueOp const& kv_op) const +struct StackOpToStackSymbol { + template + constexpr CUDF_HOST_DEVICE ValueT operator()(StackOp const& kv_op) const { return kv_op.value; } @@ -206,53 +206,42 @@ struct KVOpToStackSymbol { /** * @brief Replaces all operations that apply to stack level '0' with the empty stack symbol */ -template +template struct RemapEmptyStack { - constexpr CUDF_HOST_DEVICE KeyValueOpT operator()(KeyValueOpT const& kv_op) const + constexpr CUDF_HOST_DEVICE StackOpT operator()(StackOpT const& kv_op) const { - return kv_op.key == 0 ? empty_stack_symbol : kv_op; + return kv_op.stack_level == 0 ? empty_stack_symbol : kv_op; } - KeyValueOpT empty_stack_symbol; + StackOpT empty_stack_symbol; }; /** - * @brief Function object to return only the key part from a KeyValueOp instance. + * @brief Function object to return only the stack_level part from a StackOp instance. */ -struct KVOpToKey { - template - constexpr CUDF_HOST_DEVICE KeyT operator()(KeyValueOp const& kv_op) const +struct StackOpToStackLevel { + template + constexpr CUDF_HOST_DEVICE StackLevelT operator()(StackOp const& kv_op) const { - return kv_op.key; + return kv_op.stack_level; } }; /** - * @brief Function object to return only the value part from a KeyValueOp instance. + * @brief Retrieves an iterator that returns only the `stack_level` part from a StackOp iterator. */ -struct KVOpToValue { - template - constexpr CUDF_HOST_DEVICE ValueT operator()(KeyValueOp const& kv_op) const - { - return kv_op.value; - } -}; - -/** - * @brief Retrieves an iterator that returns only the `key` part from a KeyValueOp iterator. - */ -template -auto get_key_it(KeyValueOpItT it) +template +auto get_stack_level_it(StackOpItT it) { - return thrust::make_transform_iterator(it, KVOpToKey{}); + return thrust::make_transform_iterator(it, StackOpToStackLevel{}); } /** - * @brief Retrieves an iterator that returns only the `value` part from a KeyValueOp iterator. + * @brief Retrieves an iterator that returns only the `value` part from a StackOp iterator. 
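To make the interplay of these operators concrete, a small host-only walk-through (an illustrative sketch, not the device code path): for the toy input `{[]}` the per-symbol level deltas are +1, +1, -1, -1, so the inclusive scan yields the absolute levels 1, 2, 1, 0 after each operation; once the operations are sorted by level, propagating each PUSH to later operations of the same level hands `]` the symbol `{`, i.e. the symbol left on top of the stack after the pop, while `}` ends up at level 0 and is remapped to the empty-stack symbol.

  #include <cstdio>
  #include <stack>

  // Host-only illustration of the quantities the scans above compute (assumed toy input).
  int main()
  {
    char const ops[]        = {'{', '[', ']', '}'};
    char const empty_symbol = '_';
    std::stack<char> stack_state;
    int level = 0;
    for (char c : ops) {
      bool const is_push = (c == '{' || c == '[');
      level += is_push ? 1 : -1;  // running (inclusive-scan) stack level after the operation
      if (is_push) { stack_state.push(c); } else { stack_state.pop(); }
      char const top_after = stack_state.empty() ? empty_symbol : stack_state.top();
      std::printf("%c: level=%d, top-of-stack afterwards=%c\n", c, level, top_after);
    }
    // The four printed lines report levels 1, 2, 1, 0 and top-of-stack symbols {, [, {, _.
  }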
*/ -template -auto get_value_it(KeyValueOpItT it) +template +auto get_value_it(StackOpItT it) { - return thrust::make_transform_iterator(it, KVOpToValue{}); + return thrust::make_transform_iterator(it, StackOpToStackSymbol{}); } } // namespace detail @@ -268,7 +257,7 @@ auto get_value_it(KeyValueOpItT it) * @tparam StackSymbolItT An input iterator type that provides the sequence of symbols that * represent stack operations * @tparam SymbolPositionT The index that this stack operation is supposed to apply to - * @tparam StackSymbolToStackOpT Function object class to transform items from StackSymbolItT to + * @tparam StackSymbolToStackOpTypeT Function object class to transform items from StackSymbolItT to * stack_op_type * @tparam TopOfStackOutItT Output iterator type to which StackSymbolT are being assigned * @tparam StackSymbolT The internal type being used (usually corresponding to StackSymbolItT's @@ -296,14 +285,14 @@ auto get_value_it(KeyValueOpItT it) template void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, StackSymbolItT d_symbols, SymbolPositionT* d_symbol_positions, - StackSymbolToStackOpT symbol_to_stack_op, + StackSymbolToStackOpTypeT symbol_to_stack_op, TopOfStackOutItT d_top_of_stack, StackSymbolT empty_stack_symbol, StackSymbolT read_symbol, @@ -311,49 +300,46 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, OffsetT num_symbols_out, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { - // Type used to hold key-value pairs (key being the stack level and the value being the stack - // symbol) - using KeyValueOpT = detail::KeyValueOp; + // Type used to hold pairs of (stack_level, value) pairs + using StackOpT = detail::StackOp; - // The unsigned integer type that we use for radix sorting items of type KeyValueOpT - using KVOpUnsignedT = detail::UnsignedKeyValueOpType; + // The unsigned integer type that we use for radix sorting items of type StackOpT + using StackOpUnsignedT = detail::UnsignedStackOpType; - // Transforming sequence of stack symbols to key-value store operations, where the key corresponds - // to the stack level of a given stack operation and the value corresponds to the stack symbol of - // that operation - using StackSymbolToKVOpT = detail::StackSymbolToKVOp; + // Transforming sequence of stack symbols to stack operations + using StackSymbolToStackOpT = detail::StackSymbolToStackOp; - // TransformInputIterator converting stack symbols to key-value store operations + // TransformInputIterator converting stack symbols to stack operations using TransformInputItT = - cub::TransformInputIterator; + cub::TransformInputIterator; - // Converting a stack symbol that may either push or pop to a key-value store operation: + // Converting a stack symbol that may either push or pop to a stack operation: // stack_symbol -> ([+1,0,-1], stack_symbol) - StackSymbolToKVOpT stack_sym_to_kv_op{symbol_to_stack_op}; + StackSymbolToStackOpT stack_sym_to_kv_op{symbol_to_stack_op}; TransformInputItT stack_symbols_in(d_symbols, stack_sym_to_kv_op); // Double-buffer for sorting along the given sequence of symbol positions (the sparse // representation) cub::DoubleBuffer d_symbol_positions_db{nullptr, nullptr}; - // Double-buffer for sorting the key-value store operations - cub::DoubleBuffer d_kv_operations{nullptr, nullptr}; + // Double-buffer for sorting the stack operations by the stack level to which such operation applies + cub::DoubleBuffer d_kv_operations{nullptr, nullptr}; // A double-buffer that aliases memory from d_kv_operations with 
unsigned types in order to // be able to perform a radix sort - cub::DoubleBuffer d_kv_operations_unsigned{nullptr, nullptr}; + cub::DoubleBuffer d_kv_operations_unsigned{nullptr, nullptr}; constexpr std::size_t bits_per_byte = 8; - constexpr std::size_t begin_bit = offsetof(KeyValueOpT, key) * bits_per_byte; - constexpr std::size_t end_bit = begin_bit + (sizeof(KeyValueOpT::key) * bits_per_byte); + constexpr std::size_t begin_bit = offsetof(StackOpT, stack_level) * bits_per_byte; + constexpr std::size_t end_bit = begin_bit + (sizeof(StackOpT::stack_level) * bits_per_byte); - // The key-value store operation that makes sure that reads for stack level '0' will be populated + // The stack operation that makes sure that reads for stack level '0' will be populated // with the empty_stack_symbol - KeyValueOpT const empty_stack{0, empty_stack_symbol}; + StackOpT const empty_stack{0, empty_stack_symbol}; - cub::TransformInputIterator, KeyValueOpT*> - kv_ops_scan_in(nullptr, detail::RemapEmptyStack{empty_stack}); - KeyValueOpT* kv_ops_scan_out = nullptr; + cub::TransformInputIterator, StackOpT*> + kv_ops_scan_in(nullptr, detail::RemapEmptyStack{empty_stack}); + StackOpT* kv_ops_scan_out = nullptr; std::size_t stack_level_scan_bytes = 0; std::size_t stack_level_sort_bytes = 0; @@ -366,7 +352,7 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, stack_level_scan_bytes, stack_symbols_in, d_kv_operations.Current(), - detail::AddStackLevelFromKVOp{}, + detail::AddStackLevelFromStackOp{}, num_symbols_in, stream)); @@ -388,7 +374,7 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, match_level_scan_bytes, kv_ops_scan_in, kv_ops_scan_out, - detail::PopulatePopWithPush{symbol_to_stack_op}, + detail::PopulatePopWithPush{symbol_to_stack_op}, num_symbols_in, stream)); @@ -416,8 +402,8 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, total_temp_storage_bytes = temp_storage.size(); rmm::device_uvector d_symbol_position_alt{num_symbols_in, stream}; - rmm::device_uvector d_kv_ops_current{num_symbols_in, stream}; - rmm::device_uvector d_kv_ops_alt{num_symbols_in, stream}; + rmm::device_uvector d_kv_ops_current{num_symbols_in, stream}; + rmm::device_uvector d_kv_ops_alt{num_symbols_in, stream}; //------------------------------------------------------------------------------ // ALGORITHM @@ -427,29 +413,29 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, cub::DoubleBuffer{d_symbol_positions, d_symbol_position_alt.data()}; // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations - d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; + d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation CUDA_TRY(cub::DeviceScan::InclusiveScan(temp_storage.data(), total_temp_storage_bytes, stack_symbols_in, d_kv_operations.Current(), - detail::AddStackLevelFromKVOp{}, + detail::AddStackLevelFromStackOp{}, num_symbols_in, stream)); // Dump info on stack operations: (stack level change + symbol) -> (absolute stack level + symbol) test::print::print_array(num_symbols_in, stream, - get_key_it(stack_symbols_in), + get_stack_level_it(stack_symbols_in), get_value_it(stack_symbols_in), - get_key_it(d_kv_operations.Current()), + get_stack_level_it(d_kv_operations.Current()), get_value_it(d_kv_operations.Current())); // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = - 
cub::DoubleBuffer{reinterpret_cast(d_kv_operations.Current()), - reinterpret_cast(d_kv_operations.Alternate())}; + cub::DoubleBuffer{reinterpret_cast(d_kv_operations.Current()), + reinterpret_cast(d_kv_operations.Alternate())}; CUDA_TRY(cub::DeviceRadixSort::SortPairs(temp_storage.data(), total_temp_storage_bytes, d_kv_operations_unsigned, @@ -460,22 +446,22 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, stream)); // TransformInputIterator that remaps all operations on stack level 0 to the empty stack symbol - kv_ops_scan_in = {reinterpret_cast(d_kv_operations_unsigned.Current()), - detail::RemapEmptyStack{empty_stack}}; - kv_ops_scan_out = reinterpret_cast(d_kv_operations_unsigned.Alternate()); + kv_ops_scan_in = {reinterpret_cast(d_kv_operations_unsigned.Current()), + detail::RemapEmptyStack{empty_stack}}; + kv_ops_scan_out = reinterpret_cast(d_kv_operations_unsigned.Alternate()); // Dump info on stack operations sorted by their stack level (i.e. stack level after applying // operation) test::print::print_array( - num_symbols_in, stream, get_key_it(kv_ops_scan_in), get_value_it(kv_ops_scan_in)); + num_symbols_in, stream, get_stack_level_it(kv_ops_scan_in), get_value_it(kv_ops_scan_in)); - // Exclusive scan to match pop operations with the latest push operation of that level + // Inclusive scan to match pop operations with the latest push operation of that level CUDA_TRY(cub::DeviceScan::InclusiveScan( temp_storage.data(), total_temp_storage_bytes, kv_ops_scan_in, kv_ops_scan_out, - detail::PopulatePopWithPush{symbol_to_stack_op}, + detail::PopulatePopWithPush{symbol_to_stack_op}, num_symbols_in, stream)); @@ -483,9 +469,9 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // operation) test::print::print_array(num_symbols_in, stream, - get_key_it(kv_ops_scan_in), + get_stack_level_it(kv_ops_scan_in), get_value_it(kv_ops_scan_in), - get_key_it(kv_ops_scan_out), + get_stack_level_it(kv_ops_scan_out), get_value_it(kv_ops_scan_out)); // Fill the output tape with read-symbol @@ -494,9 +480,9 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, thrust::device_ptr{d_top_of_stack + num_symbols_out}, read_symbol); - // Transform the key-value operations to the stack symbol they represent - cub::TransformInputIterator - kv_op_to_stack_sym_it(kv_ops_scan_out, detail::KVOpToStackSymbol{}); + // Transform the stack operations to the stack symbol they represent + cub::TransformInputIterator + kv_op_to_stack_sym_it(kv_ops_scan_out, detail::StackOpToStackSymbol{}); // Scatter the stack symbols to the output tape (spots that are not scattered to have been // pre-filled with the read-symbol) From ca5d46524962d503c207faf5f7acc90d43d0b6ec Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 4 Apr 2022 07:35:33 -0700 Subject: [PATCH 010/173] device_span --- cpp/include/cudf_test/print_utilities.cuh | 6 +-- cpp/src/io/fst/logical_stack.cuh | 56 ++++++++++++----------- cpp/tests/io/fst/logical_stack_test.cu | 23 +++++----- 3 files changed, 44 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh index 1da7b9836b1..5c5a42249ac 100644 --- a/cpp/include/cudf_test/print_utilities.cuh +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -42,7 +42,7 @@ using hex_t = TaggedType; * @brief Function object to transform a built-in type to a tagged type (e.g., in order to print * values from an iterator returning uint32_t as hex values) * - * @tparam 
TaggedTypeT A TaggedType template specialisation + * @tparam TaggedTypeT A TaggedType template specialisation */ template struct ToTaggedType { @@ -55,10 +55,10 @@ struct ToTaggedType { /** * @brief Returns an iterator that causes the values from \p it to be printed as hex values. - * + * * @tparam InItT A random-access input iterator type * @param it A random-access input iterator t - * @return + * @return */ template auto hex(InItT it) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index bce362beff9..3584f6665c4 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -17,10 +17,11 @@ #include #include +#include #include -#include #include +#include #include #include @@ -51,11 +52,11 @@ namespace detail { * @brief A convenience struct that represents a stack opepration as a pair, where the stack_level * represents the stack's level and the value represents the stack symbol. * - * @tparam StackLevelT The stack level type sufficient to cover all stack levels. Must be signed type as any - * subsequence of stack operations must be able to be covered. E.g., consider the first 10 - * operations are all push and the last 10 operations are all pop operations, we need to be able to - * represent a partial aggregate of the first ten items, which is '+10', just as well as a partial - * aggregate of the last ten items, which is '-10'. + * @tparam StackLevelT The stack level type sufficient to cover all stack levels. Must be signed + * type as any subsequence of stack operations must be able to be covered. E.g., consider the first + * 10 operations are all push and the last 10 operations are all pop operations, we need to be able + * to represent a partial aggregate of the first ten items, which is '+10', just as well as a + * partial aggregate of the last ten items, which is '-10'. * @tparam ValueT The value type that corresponds to the stack symbols (i.e., covers the stack * alphabet). */ @@ -143,10 +144,10 @@ struct AddStackLevelFromStackOp { }; /** - * @brief Binary reduction operator that propagates a write operation for a specific stack level to all - * reads of that same stack level. That is, if the stack level of LHS compares equal to the stack level of the RHS and if - * the RHS is a read and the LHS is a write operation type, then we return LHS, otherwise we return - * the RHS. + * @brief Binary reduction operator that propagates a write operation for a specific stack level to + * all reads of that same stack level. That is, if the stack level of LHS compares equal to the + * stack level of the RHS and if the RHS is a read and the LHS is a write operation type, then we + * return LHS, otherwise we return the RHS. */ template struct PopulatePopWithPush { @@ -159,8 +160,8 @@ struct PopulatePopWithPush { // Whether LHS is a matching write (i.e., the push operation that is on top of the stack for the // RHS's read) - bool is_lhs_matching_write = - (lhs.stack_level == rhs.stack_level) && symbol_to_stack_op_type(lhs.value) == stack_op_type::PUSH; + bool is_lhs_matching_write = (lhs.stack_level == rhs.stack_level) && + symbol_to_stack_op_type(lhs.value) == stack_op_type::PUSH; return (is_rhs_read && is_lhs_matching_write) ? 
lhs : rhs; } @@ -277,7 +278,6 @@ auto get_value_it(StackOpItT it) * was empty * @param[in] read_symbol A symbol that may not be confused for a symbol that would push to the * stack - * @param[in] num_symbols_in The number of symbols in the sparse representation * @param[in] num_symbols_out The number of symbols that are supposed to be filled with * what-is-on-top-of-the-stack * @param[in] stream The cuda stream to which to dispatch the work @@ -287,17 +287,15 @@ template + typename StackSymbolT> void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, StackSymbolItT d_symbols, - SymbolPositionT* d_symbol_positions, + device_span d_symbol_positions, StackSymbolToStackOpTypeT symbol_to_stack_op, TopOfStackOutItT d_top_of_stack, StackSymbolT empty_stack_symbol, StackSymbolT read_symbol, - OffsetT num_symbols_in, - OffsetT num_symbols_out, + std::size_t num_symbols_out, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { // Type used to hold pairs of (stack_level, value) pairs @@ -313,6 +311,8 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, using TransformInputItT = cub::TransformInputIterator; + auto const num_symbols_in = d_symbol_positions.size(); + // Converting a stack symbol that may either push or pop to a stack operation: // stack_symbol -> ([+1,0,-1], stack_symbol) StackSymbolToStackOpT stack_sym_to_kv_op{symbol_to_stack_op}; @@ -322,7 +322,8 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // representation) cub::DoubleBuffer d_symbol_positions_db{nullptr, nullptr}; - // Double-buffer for sorting the stack operations by the stack level to which such operation applies + // Double-buffer for sorting the stack operations by the stack level to which such operation + // applies cub::DoubleBuffer d_kv_operations{nullptr, nullptr}; // A double-buffer that aliases memory from d_kv_operations with unsigned types in order to @@ -391,14 +392,15 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // Scratch memory required by the algorithms auto total_temp_storage_bytes = std::max({stack_level_scan_bytes, - stack_level_sort_bytes, - match_level_scan_bytes, - propagate_writes_scan_bytes}); + stack_level_sort_bytes, + match_level_scan_bytes, + propagate_writes_scan_bytes}); if (temp_storage.size() < total_temp_storage_bytes) { temp_storage.resize(total_temp_storage_bytes, stream); } - // Actual device buffer size, as we need to pass in an lvalue-ref to cub algorithms as temp_storage_bytes + // Actual device buffer size, as we need to pass in an lvalue-ref to cub algorithms as + // temp_storage_bytes total_temp_storage_bytes = temp_storage.size(); rmm::device_uvector d_symbol_position_alt{num_symbols_in, stream}; @@ -410,7 +412,7 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, //------------------------------------------------------------------------------ // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations d_symbol_positions_db = - cub::DoubleBuffer{d_symbol_positions, d_symbol_position_alt.data()}; + cub::DoubleBuffer{d_symbol_positions.data(), d_symbol_position_alt.data()}; // Initialize double-buffer for sorting the indexes of the sequence of sparse stack operations d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; @@ -433,9 +435,9 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, get_value_it(d_kv_operations.Current())); // Stable radix sort, sorting by stack level of the operations - d_kv_operations_unsigned = - 
cub::DoubleBuffer{reinterpret_cast(d_kv_operations.Current()), - reinterpret_cast(d_kv_operations.Alternate())}; + d_kv_operations_unsigned = cub::DoubleBuffer{ + reinterpret_cast(d_kv_operations.Current()), + reinterpret_cast(d_kv_operations.Alternate())}; CUDA_TRY(cub::DeviceRadixSort::SortPairs(temp_storage.data(), total_temp_storage_bytes, d_kv_operations_unsigned, diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 3b860867cf2..7f7d72c0db3 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -22,8 +22,8 @@ #include #include -#include #include +#include #include #include @@ -202,6 +202,7 @@ TEST_F(LogicalStackTest, GroundTruth) rmm::device_uvector d_stack_ops(stack_symbols.size(), stream_view); rmm::device_uvector d_stack_op_indexes(stack_op_indexes.size(), stream_view); auto top_of_stack_gpu = hostdevice_vector(input.size(), stream_view); + cudf::device_span d_stack_op_idx_span{d_stack_op_indexes.data(), d_stack_op_indexes.size()}; cudaMemcpyAsync(d_stack_ops.data(), stack_symbols.data(), @@ -224,16 +225,16 @@ TEST_F(LogicalStackTest, GroundTruth) rmm::device_buffer d_temp_storage{}; // Run algorithm - fst::SparseStackOpToTopOfStack(d_temp_storage, - d_stack_ops.data(), - d_stack_op_indexes.data(), - JSONToStackOp{}, - top_of_stack_gpu.device_ptr(), - empty_stack_symbol, - read_symbol, - num_stack_ops, - string_size, - stream); + fst::SparseStackOpToTopOfStack( + d_temp_storage, + d_stack_ops.data(), + d_stack_op_idx_span, + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + string_size, + stream); // Async copy results from device to host top_of_stack_gpu.device_to_host(stream_view); From f5960bd9f7c38cfe43eaadf38a48676a425039cc Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 6 Apr 2022 07:44:44 -0700 Subject: [PATCH 011/173] addressing review comments & minor cleanups --- cpp/src/io/fst/logical_stack.cuh | 5 --- cpp/tests/io/fst/logical_stack_test.cu | 58 +++++++++++++------------- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 3584f6665c4..93f1a9ac09f 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -75,11 +75,6 @@ template struct StackOpToUnsigned { }; -template <> -struct StackOpToUnsigned<1U> { - using UnsignedT = uint8_t; -}; - template <> struct StackOpToUnsigned<2U> { using UnsignedT = uint16_t; diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 7f7d72c0db3..6f0535a6c77 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -78,7 +78,7 @@ std::pair to_sparse_stack_symbols( /** * @brief Reads in a sequence of items that represent stack operations, applies these operations to - * a stack, and, for every oepration being read in, outputs what was the symbol on top of the stack + * a stack, and, for every operation being read in, outputs what was the symbol on top of the stack * before the operations was applied. In case the stack is empty before any operation, * \p empty_stack will be output instead. 
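For intuition, a minimal invocation of this reference implementation (an illustrative fragment; it assumes the JSONToStackOp bracket/brace filter defined further down in this test and the usual string/iterator headers):

  // Host-only call of the reference implementation on a short JSON fragment.
  std::string const json = R"({"a":[1]})";
  std::string top_of_stack(json.size(), '\0');
  to_top_of_stack(json.cbegin(), json.cend(), JSONToStackOp{}, '_', top_of_stack.begin());
  // top_of_stack now reads "_{{{{{[[{": the first character sees the empty stack ('_'),
  // everything inside the object sees '{', the two positions inside the array see '[',
  // and the closing '}' again sees '{'.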
* @@ -91,7 +91,8 @@ std::pair to_sparse_stack_symbols( * @param[in] end Iterator to one past the last item representing the stack operation * @param[in] to_stack_op A function object that takes an instance of InputItT's value type and * returns the kind of stack operation such item represents (i.e., of type stack_op_type) - * @param[in] empty_stack A symbol that will be written to top_of_stack whenever the stack was empty + * @param[in] empty_stack A symbol that will be written to top_of_stack_out_it whenever the stack + * was empty * @param[out] top_of_stack The output iterator to which the item will be written to * @return TopOfStackOutItT Iterators to one past the last element that was written */ @@ -103,13 +104,15 @@ TopOfStackOutItT to_top_of_stack(InputItT begin, InputItT end, ToStackOpTypeT to_stack_op, StackSymbolT empty_stack, - TopOfStackOutItT top_of_stack) + TopOfStackOutItT top_of_stack_out_it) { - std::stack stack; + // This is the data structure that keeps track of the full stack state for each input symbol + std::stack stack_state; + for (auto it = begin; it < end; it++) { // Write what is currently on top of the stack when reading in the current symbol - *top_of_stack = stack.empty() ? empty_stack : stack.top(); - top_of_stack++; + *top_of_stack_out_it = stack_state.empty() ? empty_stack : stack_state.top(); + top_of_stack_out_it++; auto const& current = *it; fst::stack_op_type op_type = to_stack_op(current); @@ -117,12 +120,12 @@ TopOfStackOutItT to_top_of_stack(InputItT begin, // Check whether this symbol corresponds to a push or pop operation and modify the stack // accordingly if (op_type == fst::stack_op_type::PUSH) { - stack.push(current); + stack_state.push(current); } else if (op_type == fst::stack_op_type::POP) { - stack.pop(); + stack_state.pop(); } } - return top_of_stack; + return top_of_stack_out_it; } /** @@ -155,8 +158,7 @@ TEST_F(LogicalStackTest, GroundTruth) // The stack symbol that we'll fill everywhere where there's nothing on the stack constexpr SymbolT empty_stack_symbol = '_'; - // This just has to be a stack symbol that may not be confused with a symbol that would push or - // pop + // This just has to be a stack symbol that may not be confused with a symbol that would push constexpr SymbolT read_symbol = 'x'; // Prepare cuda stream for data transfers & kernels @@ -185,7 +187,7 @@ TEST_F(LogicalStackTest, GroundTruth) input += input; // Getting the symbols that actually modify the stack (i.e., symbols that push or pop) - std::string stack_symbols = ""; + std::string stack_symbols{}; std::vector stack_op_indexes; stack_op_indexes.reserve(input.size()); @@ -196,13 +198,11 @@ TEST_F(LogicalStackTest, GroundTruth) std::back_inserter(stack_symbols), std::back_inserter(stack_op_indexes)); - // Prepare sparse stack ops - std::size_t num_stack_ops = stack_symbols.size(); - - rmm::device_uvector d_stack_ops(stack_symbols.size(), stream_view); - rmm::device_uvector d_stack_op_indexes(stack_op_indexes.size(), stream_view); - auto top_of_stack_gpu = hostdevice_vector(input.size(), stream_view); - cudf::device_span d_stack_op_idx_span{d_stack_op_indexes.data(), d_stack_op_indexes.size()}; + rmm::device_uvector d_stack_ops{stack_symbols.size(), stream_view}; + rmm::device_uvector d_stack_op_indexes{stack_op_indexes.size(), stream_view}; + hostdevice_vector top_of_stack_gpu{input.size(), stream_view}; + cudf::device_span d_stack_op_idx_span{d_stack_op_indexes.data(), + d_stack_op_indexes.size()}; cudaMemcpyAsync(d_stack_ops.data(), stack_symbols.data(), @@ -225,16 
+225,15 @@ TEST_F(LogicalStackTest, GroundTruth) rmm::device_buffer d_temp_storage{}; // Run algorithm - fst::SparseStackOpToTopOfStack( - d_temp_storage, - d_stack_ops.data(), - d_stack_op_idx_span, - JSONToStackOp{}, - top_of_stack_gpu.device_ptr(), - empty_stack_symbol, - read_symbol, - string_size, - stream); + fst::SparseStackOpToTopOfStack(d_temp_storage, + d_stack_ops.data(), + d_stack_op_idx_span, + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + string_size, + stream); // Async copy results from device to host top_of_stack_gpu.device_to_host(stream_view); @@ -253,6 +252,7 @@ TEST_F(LogicalStackTest, GroundTruth) // Verify results ASSERT_EQ(input.size(), top_of_stack_cpu.size()); + ASSERT_EQ(top_of_stack_gpu.size(), top_of_stack_cpu.size()); for (size_t i = 0; i < input.size() && i < top_of_stack_cpu.size(); i++) { ASSERT_EQ(top_of_stack_gpu.host_ptr()[i], top_of_stack_cpu[i]) << "Mismatch at index #" << i; } From 80226b76011518ee3f713d3b4a059cc1bb1b49a4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 6 Apr 2022 22:19:03 -0700 Subject: [PATCH 012/173] error on unsupported unsigned_t and fixed typos --- cpp/src/io/fst/logical_stack.cuh | 10 ++++++---- cpp/tests/io/fst/logical_stack_test.cu | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 93f1a9ac09f..d84a8c8fc80 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -49,7 +49,7 @@ enum class stack_op_type : int32_t { namespace detail { /** - * @brief A convenience struct that represents a stack opepration as a pair, where the stack_level + * @brief A convenience struct that represents a stack operation as a pair, where the stack_level * represents the stack's level and the value represents the stack symbol. * * @tparam StackLevelT The stack level type sufficient to cover all stack levels. Must be signed @@ -73,6 +73,7 @@ struct StackOp { */ template struct StackOpToUnsigned { + using UnsignedT = void; }; template <> @@ -166,8 +167,8 @@ struct PopulatePopWithPush { }; /** - * @brief Binary reduction operator that is used to replace each read_symbol occurance with the last - * non-read_symbol that precedes such read_symbol. + * @brief Binary reduction operator that is used to replace each read_symbol occurrence with the + * last non-read_symbol that precedes such read_symbol. */ template struct PropagateLastWrite { @@ -298,6 +299,7 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // The unsigned integer type that we use for radix sorting items of type StackOpT using StackOpUnsignedT = detail::UnsignedStackOpType; + static_assert(!std::is_void(), "unsupported StackOpT size"); // Transforming sequence of stack symbols to stack operations using StackSymbolToStackOpT = detail::StackSymbolToStackOp; @@ -495,7 +497,7 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, std::min(num_symbols_in, static_cast(10000)), stream, d_top_of_stack); // We perform an exclusive scan in order to fill the items at the very left that may - // be reading the empty stack before there's the first push occurance in the sequence. + // be reading the empty stack before there's the first push occurrence in the sequence. // Also, we're interested in the top-of-the-stack symbol before the operation was applied. 
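To illustrate what this exclusive "propagate last write" pass computes, a host-only model with toy data (a sketch, not the device implementation): the output tape is pre-filled with the read symbol, the scattered entries hold the top-of-stack after each push/pop, and the exclusive scan shifts those values so every position observes the top-of-stack before it.

  #include <cstdio>
  #include <string>

  // Host-only model of the final exclusive scan (assumed toy input "{a[b]c}").
  int main()
  {
    char const read_symbol  = 'x';
    char const empty_symbol = '_';
    std::string const tape  = "{x[x{x_";  // scattered top-of-stack *after* each stack operation
    std::string before(tape.size(), read_symbol);
    char last_write = empty_symbol;       // initial value of the exclusive scan
    for (std::size_t i = 0; i < tape.size(); ++i) {
      before[i] = last_write;                              // exclusive: only writes left of i count
      if (tape[i] != read_symbol) { last_write = tape[i]; }
    }
    std::printf("%s\n", before.c_str());  // prints "_{{[[{{", the top-of-stack before each position
  }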
CUDA_TRY(cub::DeviceScan::ExclusiveScan(temp_storage.data(), total_temp_storage_bytes, diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 6f0535a6c77..f690a8497df 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -84,7 +84,7 @@ std::pair to_sparse_stack_symbols( * * @tparam InputItT Forward input iterator type to items representing stack operations * @tparam ToStackOpTypeT A transform function object class that maps an item representing a stack - * oepration to the stack_op_type of such item + * operation to the stack_op_type of such item * @tparam StackSymbolT Type representing items being pushed onto the stack * @tparam TopOfStackOutItT A forward output iterator type being assigned items of StackSymbolT * @param[in] begin Forward iterator to the beginning of the items representing stack operations @@ -129,7 +129,7 @@ TopOfStackOutItT to_top_of_stack(InputItT begin, } /** - * @brief Funciton object used to filter for brackets and braces that represent push and pop + * @brief Function object used to filter for brackets and braces that represent push and pop * operations * */ From e8bc8a5a6f882d69f7bc168d37cd559f2d720abe Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 12 Apr 2022 22:55:00 -0700 Subject: [PATCH 013/173] minor style changes addressing review comments --- cpp/src/io/fst/logical_stack.cuh | 143 +++++++++++++------------ cpp/tests/io/fst/logical_stack_test.cu | 48 ++++----- 2 files changed, 96 insertions(+), 95 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index d84a8c8fc80..412e85204fe 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -227,7 +227,7 @@ struct StackOpToStackLevel { * @brief Retrieves an iterator that returns only the `stack_level` part from a StackOp iterator. */ template -auto get_stack_level_it(StackOpItT it) +auto get_stack_level_iterator(StackOpItT it) { return thrust::make_transform_iterator(it, StackOpToStackLevel{}); } @@ -236,7 +236,7 @@ auto get_stack_level_it(StackOpItT it) * @brief Retrieves an iterator that returns only the `value` part from a StackOp iterator. 
*/ template -auto get_value_it(StackOpItT it) +auto get_value_iterator(StackOpItT it) { return thrust::make_transform_iterator(it, StackOpToStackSymbol{}); } @@ -284,16 +284,17 @@ template -void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, - StackSymbolItT d_symbols, - device_span d_symbol_positions, - StackSymbolToStackOpTypeT symbol_to_stack_op, - TopOfStackOutItT d_top_of_stack, - StackSymbolT empty_stack_symbol, - StackSymbolT read_symbol, - std::size_t num_symbols_out, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) +void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, + device_span d_symbol_positions, + StackSymbolToStackOpTypeT symbol_to_stack_op, + TopOfStackOutItT d_top_of_stack, + StackSymbolT const empty_stack_symbol, + StackSymbolT const read_symbol, + std::size_t const num_symbols_out, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { + rmm::device_buffer temp_storage{}; + // Type used to hold pairs of (stack_level, value) pairs using StackOpT = detail::StackOp; @@ -346,28 +347,28 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // Getting temporary storage requirements for the prefix sum of the stack level after each // operation - CUDA_TRY(cub::DeviceScan::InclusiveScan(nullptr, - stack_level_scan_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{}, - num_symbols_in, - stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{}, + num_symbols_in, + stream)); // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the // operations) - CUDA_TRY(cub::DeviceRadixSort::SortPairs(nullptr, - stack_level_sort_bytes, - d_kv_operations_unsigned, - d_symbol_positions_db, - num_symbols_in, - begin_bit, - end_bit, - stream)); + CUDF_CUDA_TRY(cub::DeviceRadixSort::SortPairs(nullptr, + stack_level_sort_bytes, + d_kv_operations_unsigned, + d_symbol_positions_db, + num_symbols_in, + begin_bit, + end_bit, + stream)); // Getting temporary storage requirements for the scan to match pop operations with the latest // push of the same level - CUDA_TRY(cub::DeviceScan::InclusiveScan( + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( nullptr, match_level_scan_bytes, kv_ops_scan_in, @@ -378,14 +379,15 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // Getting temporary storage requirements for the scan to propagate top-of-stack for spots that // didn't push or pop - CUDA_TRY(cub::DeviceScan::ExclusiveScan(nullptr, - propagate_writes_scan_bytes, - d_top_of_stack, - d_top_of_stack, - detail::PropagateLastWrite{read_symbol}, - empty_stack_symbol, - num_symbols_out, - stream)); + CUDF_CUDA_TRY( + cub::DeviceScan::ExclusiveScan(nullptr, + propagate_writes_scan_bytes, + d_top_of_stack, + d_top_of_stack, + detail::PropagateLastWrite{read_symbol}, + empty_stack_symbol, + num_symbols_out, + stream)); // Scratch memory required by the algorithms auto total_temp_storage_bytes = std::max({stack_level_scan_bytes, @@ -415,34 +417,34 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation - CUDA_TRY(cub::DeviceScan::InclusiveScan(temp_storage.data(), - total_temp_storage_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{}, - num_symbols_in, - 
stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(temp_storage.data(), + total_temp_storage_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{}, + num_symbols_in, + stream)); // Dump info on stack operations: (stack level change + symbol) -> (absolute stack level + symbol) test::print::print_array(num_symbols_in, stream, - get_stack_level_it(stack_symbols_in), - get_value_it(stack_symbols_in), - get_stack_level_it(d_kv_operations.Current()), - get_value_it(d_kv_operations.Current())); + get_stack_level_iterator(stack_symbols_in), + get_value_iterator(stack_symbols_in), + get_stack_level_iterator(d_kv_operations.Current()), + get_value_iterator(d_kv_operations.Current())); // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ reinterpret_cast(d_kv_operations.Current()), reinterpret_cast(d_kv_operations.Alternate())}; - CUDA_TRY(cub::DeviceRadixSort::SortPairs(temp_storage.data(), - total_temp_storage_bytes, - d_kv_operations_unsigned, - d_symbol_positions_db, - num_symbols_in, - begin_bit, - end_bit, - stream)); + CUDF_CUDA_TRY(cub::DeviceRadixSort::SortPairs(temp_storage.data(), + total_temp_storage_bytes, + d_kv_operations_unsigned, + d_symbol_positions_db, + num_symbols_in, + begin_bit, + end_bit, + stream)); // TransformInputIterator that remaps all operations on stack level 0 to the empty stack symbol kv_ops_scan_in = {reinterpret_cast(d_kv_operations_unsigned.Current()), @@ -451,11 +453,13 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // Dump info on stack operations sorted by their stack level (i.e. stack level after applying // operation) - test::print::print_array( - num_symbols_in, stream, get_stack_level_it(kv_ops_scan_in), get_value_it(kv_ops_scan_in)); + test::print::print_array(num_symbols_in, + stream, + get_stack_level_iterator(kv_ops_scan_in), + get_value_iterator(kv_ops_scan_in)); // Inclusive scan to match pop operations with the latest push operation of that level - CUDA_TRY(cub::DeviceScan::InclusiveScan( + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( temp_storage.data(), total_temp_storage_bytes, kv_ops_scan_in, @@ -468,10 +472,10 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // operation) test::print::print_array(num_symbols_in, stream, - get_stack_level_it(kv_ops_scan_in), - get_value_it(kv_ops_scan_in), - get_stack_level_it(kv_ops_scan_out), - get_value_it(kv_ops_scan_out)); + get_stack_level_iterator(kv_ops_scan_in), + get_value_iterator(kv_ops_scan_in), + get_stack_level_iterator(kv_ops_scan_out), + get_value_iterator(kv_ops_scan_out)); // Fill the output tape with read-symbol thrust::fill(thrust::cuda::par.on(stream), @@ -499,14 +503,15 @@ void SparseStackOpToTopOfStack(rmm::device_buffer& temp_storage, // We perform an exclusive scan in order to fill the items at the very left that may // be reading the empty stack before there's the first push occurrence in the sequence. // Also, we're interested in the top-of-the-stack symbol before the operation was applied. 
- CUDA_TRY(cub::DeviceScan::ExclusiveScan(temp_storage.data(), - total_temp_storage_bytes, - d_top_of_stack, - d_top_of_stack, - detail::PropagateLastWrite{read_symbol}, - empty_stack_symbol, - num_symbols_out, - stream)); + CUDF_CUDA_TRY( + cub::DeviceScan::ExclusiveScan(temp_storage.data(), + total_temp_storage_bytes, + d_top_of_stack, + d_top_of_stack, + detail::PropagateLastWrite{read_symbol}, + empty_stack_symbol, + num_symbols_out, + stream)); // Dump the final output test::print::print_array( diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index f690a8497df..87a5bb69b2c 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -167,20 +167,20 @@ TEST_F(LogicalStackTest, GroundTruth) rmm::cuda_stream_view stream_view(stream); // Test input, - std::string input = R"( { -"category": "reference", -"index:" [4,12,42], -"author": "Nigel Rees", -"title": "Sayings of the Century", -"price": 8.95 -} -{ -"category": "reference", -"index:" [4,{},null,{"a":[]}], -"author": "Nigel Rees", -"title": "Sayings of the Century", -"price": 8.95 -} {} [] [ ])"; + std::string input = R"( {)" + R"(category": "reference",)" + R"("index:" [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} )" + R"({)" + R"("category": "reference",)" + R"("index:" [4,{},null,{"a":[]}],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} {} [] [ ])"; // Repeat input sample 1024x for (std::size_t i = 0; i < 10; i++) @@ -221,19 +221,15 @@ TEST_F(LogicalStackTest, GroundTruth) SymbolT* d_top_of_stack = nullptr; cudaMalloc(&d_top_of_stack, string_size); - // Allocate temporary storage required by the get-top-of-the-stack algorithm - rmm::device_buffer d_temp_storage{}; - // Run algorithm - fst::SparseStackOpToTopOfStack(d_temp_storage, - d_stack_ops.data(), - d_stack_op_idx_span, - JSONToStackOp{}, - top_of_stack_gpu.device_ptr(), - empty_stack_symbol, - read_symbol, - string_size, - stream); + fst::sparse_stack_op_to_top_of_stack(d_stack_ops.data(), + d_stack_op_idx_span, + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + string_size, + stream); // Async copy results from device to host top_of_stack_gpu.device_to_host(stream_view); From c5274b50af7ca2e6f7d288c49d85a0f5c32e8f53 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 11 Apr 2022 12:17:55 -0700 Subject: [PATCH 014/173] squashed with bracket/brace test --- cpp/src/io/fst/agent_dfa.cuh | 722 +++++++++++++++++++++++++++ cpp/src/io/fst/device_dfa.cuh | 264 ++++++++++ cpp/src/io/fst/dispatch_dfa.cuh | 462 +++++++++++++++++ cpp/src/io/fst/in_reg_array.cuh | 138 +++++ cpp/src/io/fst/symbol_lut.cuh | 182 +++++++ cpp/src/io/fst/transition_table.cuh | 149 ++++++ cpp/src/io/fst/translation_table.cuh | 200 ++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/fst/fst_test.cu | 291 +++++++++++ 9 files changed, 2409 insertions(+) create mode 100644 cpp/src/io/fst/agent_dfa.cuh create mode 100644 cpp/src/io/fst/device_dfa.cuh create mode 100644 cpp/src/io/fst/dispatch_dfa.cuh create mode 100644 cpp/src/io/fst/in_reg_array.cuh create mode 100644 cpp/src/io/fst/symbol_lut.cuh create mode 100644 cpp/src/io/fst/transition_table.cuh create mode 100644 cpp/src/io/fst/translation_table.cuh create mode 100644 cpp/tests/io/fst/fst_test.cu diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh 
new file mode 100644 index 00000000000..d983f9287a9 --- /dev/null +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -0,0 +1,722 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "in_reg_array.cuh" + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +//----------------------------------------------------------------------------- +// STATE VECTOR +//----------------------------------------------------------------------------- +/** + * @brief A vector is able to hold multiple state indices (e.g., to represent multiple DFA + * instances, where the i-th item would represent the i-th DFA instance). + * + * @tparam StateIndexT Signed or unsigned type used to index items inside the vector + * @tparam NUM_ITEMS The number of items to be allocated for a vector + */ +template +class MultiItemStateVector { + public: + template + constexpr CUDF_HOST_DEVICE void Set(IndexT index, StateIndexT value) noexcept + { + state_[index] = value; + } + + template + constexpr CUDF_HOST_DEVICE StateIndexT Get(IndexT index) const noexcept + { + return state_[index]; + } + + private: + StateIndexT state_[NUM_ITEMS]; +}; + +//----------------------------------------------------------------------------- +// DFA-SIMULATION STATE COMPOSITION FUNCTORS +//----------------------------------------------------------------------------- +/** + * @brief Implements an associative composition operation for state transition vectors and + * offset-to-overap vectors to be used with a prefix scan. + * + * l r = c ( s->l->r) + * 0: [2] [1] [2] (i.e. 0->2->2) + * 1: [1] [2] [2] (i.e. 1->1->2) + * 2: [0] [2] [1] (i.e. 
2->0->2) + * @tparam NUM_ITEMS The number of items stored within a vector + */ +template +struct VectorCompositeOp { + template + constexpr CUDF_HOST_DEVICE VectorT operator()(VectorT const& lhs, VectorT const& rhs) + { + VectorT res; + for (int32_t i = 0; i < NUM_ITEMS; ++i) { + res.Set(i, rhs.Get(lhs.Get(i))); + } + return res; + } +}; + +//----------------------------------------------------------------------------- +// DFA-SIMULATION CALLBACK WRAPPERS/HELPERS +//----------------------------------------------------------------------------- +template +class DFASimulationCallbackWrapper { + public: + __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( + TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) + : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + { + } + + template + __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->offset = offset; + if (!write) out_count = 0; + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + StateVectorT const& old_state, + StateVectorT const& new_state, + SymbolIndexT const& symbol_id) + { + uint32_t count = transducer_table(old_state.Get(0), symbol_id); + if (write) { + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_count + out_char] = + transducer_table(old_state.Get(0), symbol_id, out_char); + out_idx_it[out_count + out_char] = offset + character_index; + } + } + out_count += count; + } + + __host__ __device__ __forceinline__ void TearDown() {} + + public: + TransducerTableT transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t out_count; + uint32_t offset; + bool write; +}; + +//----------------------------------------------------------------------------- +// STATE-TRANSITION CALLBACKS +//----------------------------------------------------------------------------- +class StateTransitionCallbackOp { + public: + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id) const + { + } +}; +/// Type alias for a state transition callback class that performs no operation on any callback +using NoOpStateTransitionOp = StateTransitionCallbackOp; + +template +class StateVectorTransitionOp : public StateTransitionCallbackOp { + public: + __host__ __device__ __forceinline__ + StateVectorTransitionOp(TransitionTableT const& transition_table, StateVectorT& state_vector) + : transition_table(transition_table), state_vector(state_vector) + { + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const read_symbol_id) const + { + using TransitionVectorT = typename TransitionTableT::TransitionVectorT; + + for (int32_t i = 0; i < NUM_INSTANCES; ++i) { + state_vector.Set(i, transition_table(state_vector.Get(i), read_symbol_id)); + } + } + + public: + StateVectorT& state_vector; + const TransitionTableT& transition_table; +}; + +template +struct StateTransitionOp { + StateVectorT old_state_vector; + StateVectorT state_vector; + const TransitionTableT& transition_table; + CallbackOpT& callback_op; + + __host__ __device__ __forceinline__ StateTransitionOp(const TransitionTableT& transition_table, + StateVectorT state_vector, + CallbackOpT& callback_op) + : transition_table(transition_table), + state_vector(state_vector), + old_state_vector(state_vector), + callback_op(callback_op) + { + } 
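A quick numeric check of the composition operator documented earlier in this file (host-only sketch; the vector contents are taken from the example table in its comment):

  #include <cstdio>

  // Host-only check of c[i] = r[l[i]] for the 3-state example in the VectorCompositeOp docs.
  int main()
  {
    int const l[3] = {2, 1, 0};  // first transition vector  (state i -> l[i])
    int const r[3] = {1, 2, 2};  // second transition vector (state i -> r[i])
    int c[3];
    for (int i = 0; i < 3; ++i) { c[i] = r[l[i]]; }  // compose: i -> l[i] -> r[l[i]]
    std::printf("%d %d %d\n", c[0], c[1], c[2]);     // prints "2 2 1"
  }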
+ + template + __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, + const SymbolIndexT& read_symbol_id) + { + using TransitionVectorT= typename TransitionTableT::TransitionVectorT ; + old_state_vector = state_vector; + state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); + callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); + } +}; + +template +struct AgentDFA { + using SymbolIndexT = uint32_t; + using StateIndexT = uint32_t; + using AliasedLoadT = uint32_t; + using CharT = typename std::iterator_traits::value_type; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGS + //------------------------------------------------------------------------------ + static constexpr uint32_t BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS; + static constexpr uint32_t ITEMS_PER_THREAD = AgentDFAPolicy::ITEMS_PER_THREAD; + + // The number of symbols per thread + static constexpr uint32_t SYMBOLS_PER_THREAD = ITEMS_PER_THREAD; + static constexpr uint32_t SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD; + + static constexpr uint32_t MIN_UINTS_PER_BLOCK = + CUB_QUOTIENT_CEILING(SYMBOLS_PER_BLOCK, sizeof(AliasedLoadT)); + static constexpr uint32_t UINTS_PER_THREAD = + CUB_QUOTIENT_CEILING(MIN_UINTS_PER_BLOCK, BLOCK_THREADS); + static constexpr uint32_t UINTS_PER_BLOCK = UINTS_PER_THREAD * BLOCK_THREADS; + static constexpr uint32_t SYMBOLS_PER_UINT_BLOCK = UINTS_PER_BLOCK * sizeof(AliasedLoadT); + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + struct _TempStorage { + // For aliased loading of characters into shared memory + union { + CharT chars[SYMBOLS_PER_BLOCK]; + AliasedLoadT uints[UINTS_PER_BLOCK]; + }; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __device__ __forceinline__ AgentDFA(TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { + } + + //--------------------------------------------------------------------- + // STATIC PARSING PRIMITIVES + //--------------------------------------------------------------------- + template + __device__ __forceinline__ static void ThreadParse(const SymbolMatcherT& symbol_matcher, + const CharT* chars, + const SymbolIndexT& max_num_chars, + CallbackOpT callback_op, + cub::Int2Type /*IS_FULL_BLOCK*/) + { + uint32_t matched_id; + + // Iterate over symbols +#pragma unroll + for (int32_t i = 0; i < NUM_SYMBOLS; ++i) { + if (IS_FULL_BLOCK || threadIdx.x * SYMBOLS_PER_THREAD + i < max_num_chars) { + matched_id = symbol_matcher(chars[i]); + callback_op.ReadSymbol(i, matched_id); + } + } + } + + template + __device__ __forceinline__ void GetThreadStateTransitions( + const SymbolMatcherT& symbol_matcher, + const CharT* chars, + const SymbolIndexT& max_num_chars, + StateTransitionOpT& state_transition_op, + cub::Int2Type /*IS_FULL_BLOCK*/) + { + ThreadParse( + symbol_matcher, chars, max_num_chars, state_transition_op, cub::Int2Type()); + } + + 
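For a feel of the derived quantities above, one plausible instantiation (illustrative numbers only; the actual policy values come from the dispatch layer, and a 4-byte AliasedLoadT is assumed as in the agent):

  #include <cstdint>

  // Illustrative policy instantiation, mirroring the arithmetic of the derived configs above.
  constexpr uint32_t block_threads          = 128;
  constexpr uint32_t items_per_thread       = 16;
  constexpr uint32_t symbols_per_block      = block_threads * items_per_thread;  // 2048 chars per block
  constexpr uint32_t min_uints_per_block    = (symbols_per_block + 3) / 4;       // 512 aliased 4-byte loads
  constexpr uint32_t uints_per_thread       = (min_uints_per_block + block_threads - 1) / block_threads;  // 4
  constexpr uint32_t uints_per_block        = uints_per_thread * block_threads;  // 512
  constexpr uint32_t symbols_per_uint_block = uints_per_block * 4;               // 2048 chars staged in shared memory
  static_assert(symbols_per_uint_block >= symbols_per_block, "aliased staging must cover the block");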
//--------------------------------------------------------------------- + // LOADING FULL BLOCK OF CHARACTERS, NON-ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type<1> /*ALIGNMENT*/) + { + CharT thread_chars[SYMBOLS_PER_THREAD]; + + const CharT* d_block_symbols = d_chars + block_offset; + cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_chars); + +#pragma unroll + for (int32_t i = 0; i < SYMBOLS_PER_THREAD; ++i) { + temp_storage.chars[threadIdx.x + i * BLOCK_THREADS] = thread_chars[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING PARTIAL BLOCK OF CHARACTERS, NON-ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type<1> /*ALIGNMENT*/) + { + CharT thread_chars[SYMBOLS_PER_THREAD]; + + if (num_total_symbols <= block_offset) return; + + // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) + OffsetT num_total_chars = num_total_symbols - block_offset; + + const CharT* d_block_symbols = d_chars + block_offset; + cub::LoadDirectStriped( + threadIdx.x, d_block_symbols, thread_chars, num_total_chars); + +#pragma unroll + for (int32_t i = 0; i < SYMBOLS_PER_THREAD; ++i) { + temp_storage.chars[threadIdx.x + i * BLOCK_THREADS] = thread_chars[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING FULL BLOCK OF CHARACTERS, ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type /*ALIGNMENT*/) + { + AliasedLoadT thread_units[UINTS_PER_THREAD]; + + const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_units); + +#pragma unroll + for (int32_t i = 0; i < UINTS_PER_THREAD; ++i) { + temp_storage.uints[threadIdx.x + i * BLOCK_THREADS] = thread_units[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING PARTIAL BLOCK OF CHARACTERS, ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type /*ALIGNMENT*/) + { + AliasedLoadT thread_units[UINTS_PER_THREAD]; + + if (num_total_symbols <= block_offset) return; + + // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) + OffsetT num_total_units = + CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT)); + + const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + cub::LoadDirectStriped( + threadIdx.x, d_block_symbols, thread_units, num_total_units); + +#pragma unroll + for (int32_t i = 0; i < UINTS_PER_THREAD; ++i) { + temp_storage.uints[threadIdx.x + i * BLOCK_THREADS] = thread_units[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING BLOCK OF CHARACTERS: DISPATCHER + 
//--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols) + { + // Check if pointer is aligned to four bytes + if (((uintptr_t)(const void*)(d_chars + block_offset) % 4) == 0) { + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<4>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } else { + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } + } + + template + __device__ __forceinline__ void GetThreadStateTransitionVector( + const SymbolMatcherT& symbol_matcher, + const TransitionTableT& transition_table, + const CharT* d_chars, + const OffsetT block_offset, + const OffsetT num_total_symbols, + StateVectorT& state_vector) + { + using StateVectorTransitionOpT = StateVectorTransitionOp; + + // Start parsing and to transition states + StateVectorTransitionOpT transition_op(transition_table, state_vector); + + // Load characters into shared memory + LoadBlock(d_chars, block_offset, num_total_symbols); + + // If this is a full block (i.e., all threads can parse all their symbols) + OffsetT num_block_chars = num_total_symbols - block_offset; + bool is_full_block = (num_block_chars >= SYMBOLS_PER_BLOCK); + + // Ensure characters have been loaded + __syncthreads(); + + // Thread's symbols + CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + + // Parse thread's symbols and transition the state-vector + if (is_full_block) { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } else { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } + + // transition_op.TearDown(); + } + + template + __device__ __forceinline__ void GetThreadStateTransitions( + SymbolMatcherT const& symbol_matcher, + TransitionTableT const& transition_table, + CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + StateVectorT& state_vector, + CallbackOpT& callback_op, + cub::Int2Type /**/) + { + using StateTransitionOpT = StateTransitionOp; + + // Start parsing and to transition states + StateTransitionOpT transition_op(transition_table, state_vector, callback_op); + + // Load characters into shared memory + if (!BYPASS_LOAD) LoadBlock(d_chars, block_offset, num_total_symbols); + + // If this is a full block (i.e., all threads can parse all their symbols) + OffsetT num_block_chars = num_total_symbols - block_offset; + bool is_full_block = (num_block_chars >= SYMBOLS_PER_BLOCK); + + // Ensure characters have been loaded + __syncthreads(); + + // Thread's symbols + CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + + // Initialize callback + callback_op.Init(block_offset + threadIdx.x * SYMBOLS_PER_THREAD); + + // Parse thread's symbols and transition the state-vector + if (is_full_block) { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } else { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } + 
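+    // Note: when the wrapped callback is a DFASimulationCallbackWrapper with its write flag unset,
+    // the pass above only accumulates out_count; the caller is expected to invoke this function a
+    // second time with the write flag set to actually emit the transduced symbols and their
+    // input indices.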
+ callback_op.TearDown(); + } +}; + +template +__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ + void SimulateDFAKernel(DfaT dfa, + SymbolItT d_chars, + OffsetT const num_chars, + uint32_t seed_state, + StateVectorT* __restrict__ d_thread_state_transition, + TileStateT tile_state, + OutOffsetScanTileState offset_tile_state, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it) +{ + using StateIndexT = uint32_t; + + using AgentDfaSimT = AgentDFA; + + static constexpr uint32_t NUM_STATES = DfaT::MAX_NUM_STATES; + + enum { + BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentDFAPolicy::ITEMS_PER_THREAD, + SYMBOLS_PER_BLOCK = AgentDfaSimT::SYMBOLS_PER_BLOCK + }; + + // Shared memory required by the DFA simulator + __shared__ typename AgentDfaSimT::TempStorage dfa_storage; + + // Shared memory required by the symbol group lookup table + __shared__ typename DfaT::SymbolGroupStorageT symbol_matcher_storage; + + // Shared memory required by the transition table + __shared__ typename DfaT::TransitionTableStorageT transition_table_storage; + + // Shared memory required by the transducer table + __shared__ typename DfaT::TranslationTableStorageT transducer_table_storage; + + // Initialize symbol group lookup table + auto symbol_matcher = dfa.InitSymbolGroupLUT(symbol_matcher_storage); + + // Initialize transition table + auto transition_table = dfa.InitTransitionTable(transition_table_storage); + + // Initialize transition table + auto transducer_table = dfa.InitTranslationTable(transducer_table_storage); + + // Set up DFA + AgentDfaSimT agent_dfa(dfa_storage); + + // Memory is the state transition vector passed on to the second stage of the algorithm + StateVectorT out_state_vector; + + // Stage 1: Compute the state-transition vector + if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { + // StateVectorT state_vector; + MultiItemStateVector state_vector; + + // Initialize the seed state transition vector with the identity vector +#pragma unroll + for (int32_t i = 0; i < NUM_STATES; ++i) { + state_vector.Set(i, i); + } + + // Compute the state transition vector + agent_dfa.GetThreadStateTransitionVector(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + state_vector); + + // Initialize the state transition vector passed on to the second stage +#pragma unroll + for (int32_t i = 0; i < NUM_STATES; ++i) { + out_state_vector.Set(i, state_vector.Get(i)); + } + + // Write out state-transition vector + if (!IS_SINGLE_PASS) { + d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x] = out_state_vector; + } + } + // Stage 2: Perform FSM simulation + if ((!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS) { + constexpr uint32_t SINGLE_ITEM_COUNT = 1; + MultiItemStateVector state; + + //------------------------------------------------------------------------------ + // SINGLE-PASS: + // -> block-wide inclusive prefix scan on the state transition vector + // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that + // incorporates all preceding blocks/tiles results) + //------------------------------------------------------------------------------ + if (IS_SINGLE_PASS) { + uint32_t tile_idx = blockIdx.x; + using StateVectorCompositeOpT = VectorCompositeOp; + + using PrefixCallbackOpT_ = + cub::TilePrefixCallbackOp; + + using ItemsBlockScan = + cub::BlockScan; + + __shared__ typename ItemsBlockScan::TempStorage 
scan_temp_storage; + __shared__ typename PrefixCallbackOpT_::TempStorage prefix_callback_temp_storage; + + // STATE-TRANSITION IDENTITY VECTOR + StateVectorT state_identity_vector; + for (int32_t i = 0; i < NUM_STATES; ++i) { + state_identity_vector.Set(i, i); + } + StateVectorCompositeOpT state_vector_scan_op; + + // + if (tile_idx == 0) { + StateVectorT block_aggregate; + ItemsBlockScan(scan_temp_storage) + .ExclusiveScan(out_state_vector, + out_state_vector, + state_identity_vector, + state_vector_scan_op, + block_aggregate); + + if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { + tile_state.SetInclusive(0, block_aggregate); + } + } else { + auto prefix_op = PrefixCallbackOpT_( + tile_state, prefix_callback_temp_storage, state_vector_scan_op, tile_idx); + + ItemsBlockScan(scan_temp_storage) + .ExclusiveScan(out_state_vector, out_state_vector, state_vector_scan_op, prefix_op); + } + __syncthreads(); + state.Set(0, out_state_vector.Get(seed_state)); + } else { + state.Set( + 0, d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x].Get(seed_state)); + } + + // Perform finite-state machine simulation, computing size of transduced output + DFASimulationCallbackWrapper + callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + + MultiItemStateVector t_start_state; + t_start_state.Set(0, state.Get(seed_state)); + agent_dfa.GetThreadStateTransitions(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + state, + callback_wrapper, + cub::Int2Type()); + + __syncthreads(); + using OffsetPrefixScanCallbackOpT_ = + cub::TilePrefixCallbackOp; + + using OutOffsetBlockScan = + cub::BlockScan; + + __shared__ typename OutOffsetBlockScan::TempStorage scan_temp_storage; + __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; + + uint32_t tile_idx = blockIdx.x; + if (tile_idx == 0) { + OffsetT block_aggregate = 0; + OutOffsetBlockScan(scan_temp_storage) + .ExclusiveScan(callback_wrapper.out_count, + callback_wrapper.out_count, + static_cast(0), + cub::Sum{}, + block_aggregate); + + if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { + offset_tile_state.SetInclusive(0, block_aggregate); + } + + if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { + *d_num_transduced_out_it = block_aggregate; + } + } else { + auto prefix_op = OffsetPrefixScanCallbackOpT_( + offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); + + OutOffsetBlockScan(scan_temp_storage) + .ExclusiveScan( + callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); + + if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { + *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); + } + } + + callback_wrapper.write = true; + agent_dfa.GetThreadStateTransitions(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + t_start_state, + callback_wrapper, + cub::Int2Type()); + } +} + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh new file mode 100644 index 00000000000..795c4c98bec --- /dev/null +++ b/cpp/src/io/fst/device_dfa.cuh @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cub/util_type.cuh" +#include "dispatch_dfa.cuh" +#include +#include +#include + +#include + +namespace cudf { +namespace io { +namespace fst { + +/** + * @brief Uses a deterministic finite automaton to transduce a sequence of symbols from an input + * iterator to a sequence of transduced output symbols. + * + * @tparam SymbolItT Random-access input iterator type to symbols fed into the FST + * @tparam DfaT The DFA specification + * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be + * written + * @tparam TransducedIndexOutItT Random-access output iterator type to which the indexes of the + * symbols that caused some output to be written. + * @tparam TransducedCountOutItT A single-item output iterator type to which the total number of + * output symbols is written + * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) + * the output symbols + * @param[in] d_temp_storage Device-accessible allocation of temporary storage. When NULL, the + * required allocation size is written to \p temp_storage_bytes and no work is done. + * @param[in,out] temp_storage_bytes Reference to size in bytes of \p d_temp_storage allocation + * @param[in] dfa The DFA specifying the number of distinct symbol groups, transition table, and + * translation table + * @param[in] d_chars_in Random-access input iterator to the beginning of the sequence of input + * symbols + * @param[in] num_chars The total number of input symbols to process + * @param[out] transduced_out_it Random-access output iterator to which the transduced output is + * written + * @param[out] transduced_out_idx_it Random-access output iterator to which, the index i is written + * iff the i-th input symbol caused some output to be written + * @param[out] d_num_transduced_out_it A single-item output iterator type to which the total number + * of output symbols is written + * @param[in] seed_state The DFA's starting state. For streaming DFAs this corresponds to the + * "end-state" of the previous invocation of the algorithm. + * @param[in] stream CUDA stream to launch kernels within. Default is the null-stream. + */ +template +cudaError_t DeviceTransduce(void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + uint32_t seed_state = 0, + cudaStream_t stream = 0) +{ + using DispatchDfaT = detail::DispatchFSM; + + return DispatchDfaT::Dispatch(d_temp_storage, + temp_storage_bytes, + dfa, + seed_state, + d_chars_in, + num_chars, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it, + stream); +} + +/** + * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the + * transition table and its number of states, the mapping of symbols to symbol groups, and the + * translation table that specifies which state transitions cause which output to be written). 
+ * + * @tparam OutSymbolT The symbol type being output by the finite-state transducer + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of + * the transition table) + * @tparam TT_NUM_STATES The number of states defined by the DFA (the other dimension of the + * transition table) + */ +template +class Dfa { + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = TT_NUM_STATES; + + private: + // Symbol-group id lookup table + using MatcherT = detail::SingleSymbolSmemLUT; + using MatcherInitT = typename MatcherT::KernelParameter; + + // Transition table + using TransitionTableT = detail::TransitionTable; + using TransitionTableInitT = typename TransitionTableT::KernelParameter; + + // Translation lookup table + using OutSymbolOffsetT = uint32_t; + using TransducerTableT = detail::TransducerLookupTable; + using TransducerTableInitT = typename TransducerTableT::KernelParameter; + + // Private members (passed between host/device) + /// Information to initialize the device-side lookup table that maps symbol -> symbol group id + MatcherInitT symbol_matcher_init; + + /// Information to initialize the device-side transition table + TransitionTableInitT tt_init; + + /// Information to initialize the device-side translation table + TransducerTableInitT tt_out_init; + + public: + //--------------------------------------------------------------------- + // DEVICE-SIDE MEMBER FUNCTIONS + //--------------------------------------------------------------------- + using SymbolGroupStorageT = typename MatcherT::TempStorage; + using TransitionTableStorageT = typename TransitionTableT::TempStorage; + using TranslationTableStorageT = typename TransducerTableT::TempStorage; + + __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) + { + return MatcherT(symbol_matcher_init, temp_storage); + } + + __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) + { + return TransitionTableT(tt_init, temp_storage); + } + + __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) + { + return TransducerTableT(tt_out_init, temp_storage); + } + + //--------------------------------------------------------------------- + // HOST-SIDE MEMBER FUNCTIONS + //--------------------------------------------------------------------- + template + cudaError_t Init(SymbolGroupIdItT const& symbol_vec, + std::vector> const& tt_vec, + std::vector>> const& out_tt_vec, + cudaStream_t stream = 0) + { + cudaError_t error = cudaSuccess; + + enum : uint32_t { MEM_SYMBOL_MATCHER = 0, MEM_TT, MEM_OUT_TT, NUM_ALLOCATIONS }; + + size_t allocation_sizes[NUM_ALLOCATIONS] = {0}; + void* allocations[NUM_ALLOCATIONS] = {0}; + + // Memory requirements: lookup table + error = MatcherT::PrepareLUT( + nullptr, allocation_sizes[MEM_SYMBOL_MATCHER], symbol_vec, symbol_matcher_init); + if (error) return error; + + // Memory requirements: transition table + error = + TransitionTableT::CreateTransitionTable(nullptr, allocation_sizes[MEM_TT], tt_vec, tt_init); + if (error) return error; + + // Memory requirements: transducer table + error = TransducerTableT::CreateTransitionTable( + nullptr, allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init); + if (error) return error; + + // Memory requirements: total memory + size_t temp_storage_bytes = 0; + error = cub::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes); + if 
(error) return error; + + // Allocate memory + void* d_temp_storage = nullptr; + error = cudaMalloc(&d_temp_storage, temp_storage_bytes); + if (error) return error; + + // Alias memory + error = + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + if (error) return error; + + // Initialize symbol group lookup table + error = MatcherT::PrepareLUT(allocations[MEM_SYMBOL_MATCHER], + allocation_sizes[MEM_SYMBOL_MATCHER], + symbol_vec, + symbol_matcher_init, + stream); + if (error) return error; + + // Initialize state transition table + error = TransitionTableT::CreateTransitionTable( + allocations[MEM_TT], allocation_sizes[MEM_TT], tt_vec, tt_init, stream); + if (error) return error; + + // Initialize finite-state transducer lookup table + error = TransducerTableT::CreateTransitionTable( + allocations[MEM_OUT_TT], allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init, stream); + if (error) return error; + + return error; + } + + template + cudaError_t Transduce(void* d_temp_storage, + size_t& temp_storage_bytes, + SymbolT const* d_chars, + OffsetT num_chars, + TransducedOutItT d_out_it, + TransducedIndexOutItT d_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + const uint32_t seed_state = 0, + cudaStream_t stream = 0) + { + return DeviceTransduce(d_temp_storage, + temp_storage_bytes, + *this, + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + } +}; + +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh new file mode 100644 index 00000000000..fc14faaf10a --- /dev/null +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "agent_dfa.cuh" +#include "in_reg_array.cuh" + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +/** + * @brief The tuning policy comprising all the architecture-specific compile-time tuning parameters. + * + * @tparam _BLOCK_THREADS Number of threads per block + * @tparam _ITEMS_PER_THREAD Number of symbols processed by each thread + */ +template +struct AgentDFAPolicy { + // The number of threads per block + static constexpr int32_t BLOCK_THREADS = _BLOCK_THREADS; + + // The number of symbols processed by each thread + static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; +}; + +/** + * @brief The list of architecture-specific tuning policies. Yet TBD. 
+ */ +struct DeviceFSMPolicy { + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { + enum { + BLOCK_THREADS = 128, + ITEMS_PER_THREAD = 32, + }; + + using AgentDFAPolicy = AgentDFAPolicy; + }; + + // Top-of-list of the tuning policy "chain" + using MaxPolicy = Policy900; +}; + +/** + * @brief Kernel for initializing single-pass prefix scan tile states + * + * @param items_state The tile state + * @param num_tiles The number of tiles to be initialized + * @return + */ +template +__global__ void initialization_pass_kernel(TileState items_state, uint32_t num_tiles) +{ + items_state.InitializeStatus(num_tiles); +} + +template +struct DispatchFSM : DeviceFSMPolicy { + //------------------------------------------------------------------------------ + // DEFAULT TYPES + //------------------------------------------------------------------------------ + using StateIndexT = uint32_t; + using BlockOffsetT = uint32_t; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGS + //------------------------------------------------------------------------------ + // DFA-specific configs + static constexpr int32_t MAX_NUM_STATES = DfaT::MAX_NUM_STATES; + static constexpr int32_t MAX_NUM_SYMBOLS = DfaT::MAX_NUM_SYMBOLS; + + // Whether to use a single-pass prefix scan that does all in on + static constexpr bool SINGLE_PASS_STV = false; + + // Whether this is a finite-state transform + static constexpr bool IS_FST = true; + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + using StateVectorCompositeOpT = VectorCompositeOp; + + //------------------------------------------------------------------------------ + // MEMBER VARS + //------------------------------------------------------------------------------ + void* d_temp_storage; + size_t& temp_storage_bytes; + DfaT dfa; + StateIndexT seed_state; + SymbolItT d_chars_in; + OffsetT num_chars; + TransducedOutItT transduced_out_it; + TransducedIndexOutItT transduced_out_idx_it; + TransducedCountOutItT d_num_transduced_out_it; + cudaStream_t stream; + int ptx_version; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + CUB_RUNTIME_FUNCTION __forceinline__ DispatchFSM(void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + StateIndexT seed_state, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + cudaStream_t stream, + int ptx_version) + : d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + dfa(dfa), + seed_state(seed_state), + d_chars_in(d_chars_in), + num_chars(num_chars), + transduced_out_it(transduced_out_it), + transduced_out_idx_it(transduced_out_idx_it), + d_num_transduced_out_it(d_num_transduced_out_it), + stream(stream), + ptx_version(ptx_version) + { + } + + //------------------------------------------------------------------------------ + // DISPATCH INTERFACE + //------------------------------------------------------------------------------ + CUB_RUNTIME_FUNCTION 
__forceinline__ static cudaError_t Dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + StateIndexT seed_state, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + cudaStream_t stream) + { + using MaxPolicyT = DispatchFSM::MaxPolicy; + + cudaError_t error; + + // Get PTX version + int ptx_version; + error = cub::PtxVersion(ptx_version); + if (error) return error; + + // Create dispatch functor + DispatchFSM dispatch(d_temp_storage, + temp_storage_bytes, + dfa, + seed_state, + d_chars_in, + num_chars, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it, + stream, + ptx_version); + + error = MaxPolicyT::Invoke(ptx_version, dispatch); + return error; + } + + //------------------------------------------------------------------------------ + // DFA SIMULATION KERNEL INVOCATION + //------------------------------------------------------------------------------ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokeDFASimulationKernel(DFASimulationKernelT dfa_kernel, + int32_t sm_count, + StateIndexT seed_state, + StateVectorT* d_thread_state_transition, + TileStateT tile_state, + FstScanTileStateT fst_tile_state) + + { + cudaError_t error = cudaSuccess; + cub::KernelConfig dfa_simulation_config; + + using PolicyT = typename ActivePolicyT::AgentDFAPolicy; + if (CubDebug(error = dfa_simulation_config.Init(dfa_kernel))) return error; + + // Kernel invocation + uint32_t grid_size = + CUB_QUOTIENT_CEILING(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD); + uint32_t block_threads = dfa_simulation_config.block_threads; + + dfa_kernel<<>>(dfa, + d_chars_in, + num_chars, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it); + + // Check for errors + if (CubDebug(error = cudaPeekAtLastError())) return error; + + return error; + } + + /** + * @brief Computes the state-transition vectors + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + ComputeStateTransitionVector(uint32_t sm_count, + TileStateT tile_state, + FstScanTileStateT fst_tile_state, + StateVectorT* d_thread_state_transition) + { + StateIndexT seed_state = 0; + + return InvokeDFASimulationKernel( + SimulateDFAKernel, + sm_count, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state); + } + + /** + * @brief Performs the actual DFA simulation. 
+ */
+  template 
+  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t
+  SimulateDFA(uint32_t sm_count,
+              TileStateT tile_state,
+              FstScanTileStateT fst_tile_state,
+              StateIndexT seed_state,
+              StateVectorT* d_thread_state_transition)
+  {
+    return InvokeDFASimulationKernel(
+      SimulateDFAKernel,
+      sm_count,
+      seed_state,
+      d_thread_state_transition,
+      tile_state,
+      fst_tile_state);
+  }
+
+  //------------------------------------------------------------------------------
+  // POLICY INVOCATION
+  //------------------------------------------------------------------------------
+  template 
+  CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke()
+  {
+    cudaError_t error = cudaSuccess;
+
+    // Get SM count
+    int device_ordinal;
+    int sm_count;
+
+    // Get current device
+    error = cudaGetDevice(&device_ordinal);
+    if (error) return error;
+
+    error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal);
+    if (error) return error;
+
+    //------------------------------------------------------------------------------
+    // DERIVED TYPEDEFS
+    //------------------------------------------------------------------------------
+    // Type used to represent state-transition vectors
+    using StateVectorT = MultiFragmentInRegArray;
+
+    // Scan tile state used for propagating composed state transition vectors
+    using ScanTileStateT = typename cub::ScanTileState;
+
+    // Scan tile state used for propagating transduced output offsets
+    using FstScanTileStateT = typename cub::ScanTileState;
+
+    // STATE-TRANSITION IDENTITY VECTOR
+    StateVectorT state_identity_vector;
+    for (int32_t i = 0; i < MAX_NUM_STATES; ++i) {
+      state_identity_vector.Set(i, i);
+    }
+    StateVectorCompositeOpT state_vector_scan_op;
+
+    //------------------------------------------------------------------------------
+    // DERIVED CONFIGS
+    //------------------------------------------------------------------------------
+    enum {
+      BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS,
+      SYMBOLS_PER_THREAD = ActivePolicyT::ITEMS_PER_THREAD,
+      NUM_SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD
+    };
+
+    BlockOffsetT num_blocks = CUB_QUOTIENT_CEILING(num_chars, NUM_SYMBOLS_PER_BLOCK);
+    size_t num_threads = num_blocks * BLOCK_THREADS;
+
+    //------------------------------------------------------------------------------
+    // TEMPORARY MEMORY REQUIREMENTS
+    //------------------------------------------------------------------------------
+    enum { MEM_STATE_VECTORS = 0, MEM_SCAN, MEM_SINGLE_PASS_STV, MEM_FST_OFFSET, NUM_ALLOCATIONS };
+
+    size_t allocation_sizes[NUM_ALLOCATIONS] = {0};
+    void* allocations[NUM_ALLOCATIONS] = {0};
+
+    size_t vector_scan_storage_bytes = 0;
+
+    // [MEMORY REQUIREMENTS] STATE-TRANSITION SCAN
+    cub::DeviceScan::ExclusiveScan(nullptr,
+                                   vector_scan_storage_bytes,
+                                   static_cast(allocations[MEM_STATE_VECTORS]),
+                                   static_cast(allocations[MEM_STATE_VECTORS]),
+                                   state_vector_scan_op,
+                                   state_identity_vector,
+                                   num_threads,
+                                   stream);
+
+    allocation_sizes[MEM_STATE_VECTORS] = num_threads * sizeof(StateVectorT);
+    allocation_sizes[MEM_SCAN] = vector_scan_storage_bytes;
+
+    // Bytes needed for tile status descriptors (fusing state-transition vector + DFA simulation)
+    if (SINGLE_PASS_STV) {
+      error = ScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_SINGLE_PASS_STV]);
+      if (error) return error;
+    }
+
+    // Bytes needed for tile status descriptors (DFA simulation pass for output size computation +
+    // output-generating pass)
+    if (IS_FST) {
+      error = FstScanTileStateT::AllocationSize(num_blocks,
allocation_sizes[MEM_FST_OFFSET]); + if (error) return error; + } + + // Alias the temporary allocations from the single storage blob (or compute the necessary size + // of the blob) + error = + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + if (error) return error; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) return cudaSuccess; + + // Alias memory for state-transition vectors + StateVectorT* d_thread_state_transition = + static_cast(allocations[MEM_STATE_VECTORS]); + + //------------------------------------------------------------------------------ + // INITIALIZE SCAN TILE STATES COMPUTING TRANSDUCED OUTPUT OFFSETS + //------------------------------------------------------------------------------ + FstScanTileStateT fst_offset_tile_state; + if (IS_FST) { + // Construct the tile status (aliases memory internally et al.) + error = fst_offset_tile_state.Init( + num_blocks, allocations[MEM_FST_OFFSET], allocation_sizes[MEM_FST_OFFSET]); + if (error) return error; + constexpr uint32_t FST_INIT_TPB = 256; + uint32_t num_fst_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, FST_INIT_TPB); + initialization_pass_kernel<<>>( + fst_offset_tile_state, num_blocks); + } + + //------------------------------------------------------------------------------ + // COMPUTE STATE-TRANSITION VECTORS + //------------------------------------------------------------------------------ + ScanTileStateT stv_tile_state; + if constexpr(SINGLE_PASS_STV) { + // Construct the tile status (aliases memory internally et al.) + error = stv_tile_state.Init( + num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]); + if (error) return error; + constexpr uint32_t STV_INIT_TPB = 256; + uint32_t num_stv_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, STV_INIT_TPB); + initialization_pass_kernel<<>>(stv_tile_state, + num_blocks); + } else { + // Compute state-transition vectors + // TODO tag dispatch or constexpr if depending on single-pass config to avoid superfluous + // template instantiations + ComputeStateTransitionVector( + sm_count, stv_tile_state, fst_offset_tile_state, d_thread_state_transition); + + // State-transition vector scan computing using the composition operator + cub::DeviceScan::ExclusiveScan(allocations[MEM_SCAN], + allocation_sizes[MEM_SCAN], + d_thread_state_transition, + d_thread_state_transition, + state_vector_scan_op, + state_identity_vector, + num_threads, + stream); + } + + //------------------------------------------------------------------------------ + // SIMULATE DFA + //------------------------------------------------------------------------------ + return SimulateDFA( + sm_count, stv_tile_state, fst_offset_tile_state, seed_state, d_thread_state_transition); + } +}; +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh new file mode 100644 index 00000000000..f9619c82fe8 --- /dev/null +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +/** + * @brief A bit-packed array of items that can be backed by registers yet allows to be dynamically + * addressed at runtime. The data struture is explained in greater detail in the paper ParPaRaw: Massively Parallel Parsing of + * Delimiter-Separated Raw Data. + * + * @tparam NUM_ITEMS The maximum number of items this data structure is supposed to store + * @tparam MAX_ITEM_VALUE The maximum value that one item can represent + * @tparam BackingFragmentT The data type that is holding the fragments + */ +template +struct MultiFragmentInRegArray { + /// [b] Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] + static constexpr uint32_t MIN_BITS_PER_ITEM = + (MAX_ITEM_VALUE == 0) ? 1 : cub::Log2<(MAX_ITEM_VALUE + 1)>::VALUE; + + /// Number of bits that each fragment can store + static constexpr uint32_t NUM_BITS_PER_FRAGMENT = sizeof(BackingFragmentT) * 8; + + /// [a] The number of bits per fragment per item in the array + static constexpr uint32_t AVAIL_BITS_PER_FRAG_ITEM = NUM_BITS_PER_FRAGMENT / NUM_ITEMS; + + /// [k] The number of bits per item per fragment to be a power of two to avoid costly integer + /// multiplication + /// TODO: specialise for VOLTA and later architectures that have efficient integer multiplication + static constexpr uint32_t BITS_PER_FRAG_ITEM = + 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); + static constexpr uint32_t LOG2_BITS_PER_FRAG_ITEM = cub::Log2::VALUE; + + // [f] Number of fragments required to store and to reconstruct an item + static constexpr uint32_t FRAGMENTS_PER_ITEM = + (MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM; + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + __device__ __host__ __forceinline__ unsigned int bfe(const unsigned int& data, + unsigned int bit_start, + unsigned int num_bits) const + { +#if CUB_PTX_ARCH > 0 + return cub::BFE(data, bit_start, num_bits); +#else + const unsigned int MASK = (1 << num_bits) - 1; + return (data >> bit_start) & MASK; +#endif + } + + __device__ __host__ __forceinline__ void bfi(unsigned int& data, + unsigned int bits, + unsigned int bit_start, + unsigned int num_bits) const + { +#if CUB_PTX_ARCH > 0 + cub::BFI(data, data, bits, bit_start, num_bits); +#else + unsigned int x = bits << bit_start; + unsigned int y = data; + unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; + unsigned int MASK_Y = ~MASK_X; + data = (y & MASK_Y) | (x & MASK_X); +#endif + } + + BackingFragmentT data[FRAGMENTS_PER_ITEM]; + + //------------------------------------------------------------------------------ + // ACCESSORS + //------------------------------------------------------------------------------ + __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const + { + uint32_t val = 0; + + // #pragma unroll + for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { + val = 
val | bfe(data[i], index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM) + << (i * BITS_PER_FRAG_ITEM); + } + return val; + } + + __host__ __device__ __forceinline__ void Set(uint32_t index, uint32_t value) + { + // #pragma unroll + for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { + uint32_t frag_bits = bfe(value, i * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); + bfi(data[i], frag_bits, index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); + } + } + + //------------------------------------------------------------------------------ + // CONSTRUCTORS + //------------------------------------------------------------------------------ + __host__ __device__ __forceinline__ MultiFragmentInRegArray() + { + for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { + data[i] = 0; + } + } + + __host__ __device__ __forceinline__ MultiFragmentInRegArray(uint32_t const (&array)[NUM_ITEMS]) + { + for (uint32_t i = 0; i < NUM_ITEMS; ++i) { + Set(i, array[i]); + } + } +}; + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/symbol_lut.cuh b/cpp/src/io/fst/symbol_lut.cuh new file mode 100644 index 00000000000..08d5f4db58d --- /dev/null +++ b/cpp/src/io/fst/symbol_lut.cuh @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { +/** + * @brief Class template that can be plugged into the finite-state machine to look up the symbol + * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no + * look-ahead). 
+ *
+ * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id
+ */
+template <typename SymbolT>
+struct SingleSymbolSmemLUT {
+  //------------------------------------------------------------------------------
+  // DEFAULT TYPEDEFS
+  //------------------------------------------------------------------------------
+  // Type used for representing a symbol group id (i.e., what we return for a given symbol)
+  using SymbolGroupIdT = uint8_t;
+
+  //------------------------------------------------------------------------------
+  // DERIVED CONFIGURATIONS
+  //------------------------------------------------------------------------------
+  /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256)
+  static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U);
+
+  //------------------------------------------------------------------------------
+  // TYPEDEFS
+  //------------------------------------------------------------------------------
+
+  struct _TempStorage {
+    // d_match_meta_data[symbol] -> symbol group index
+    SymbolGroupIdT match_meta_data[NUM_ENTRIES_PER_LUT];
+  };
+
+  struct KernelParameter {
+    // d_match_meta_data[min(symbol,num_valid_entries)] -> symbol group index
+    SymbolGroupIdT num_valid_entries;
+
+    // d_match_meta_data[symbol] -> symbol group index
+    SymbolGroupIdT* d_match_meta_data;
+  };
+
+  struct TempStorage : cub::Uninitialized<_TempStorage> {
+  };
+
+  //------------------------------------------------------------------------------
+  // HELPER METHODS
+  //------------------------------------------------------------------------------
+  /**
+   * @brief Prepares the lookup table that maps each symbol to its symbol group id, either
+   * returning the temporary storage requirements or copying the table to the provided device-side
+   * storage.
+   *
+   * @param[in] d_temp_storage Device-side temporary storage that can be used to store the lookup
+   * table. If no storage is provided it will return the temporary storage requirements in \p
+   * d_temp_storage_bytes.
+   * @param[in,out] d_temp_storage_bytes Size of the device-side temporary storage that can be
+   * used, in bytes
+   * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols
+   * (characters!) that correspond to the i-th symbol group index
+   * @param[out] kernel_param The kernel parameter object to be initialized with the given mapping
+   * of symbols to symbol group ids.
+   * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table
+   * @return The cudaError_t returned by the lookup-table copy, or cudaSuccess when only the
+   * temporary storage requirements are computed
+   */
+  template <typename SymbolGroupItT>
+  __host__ __forceinline__ static cudaError_t PrepareLUT(void* d_temp_storage,
+                                                         size_t& d_temp_storage_bytes,
+                                                         SymbolGroupItT const& symbol_strings,
+                                                         KernelParameter& kernel_param,
+                                                         cudaStream_t stream = 0)
+  {
+    // The symbol group index to be returned if none of the given symbols match
+    SymbolGroupIdT no_match_id = symbol_strings.size();
+
+    std::vector<SymbolGroupIdT> lut(NUM_ENTRIES_PER_LUT);
+    SymbolGroupIdT max_base_match_val = 0;
+
+    // Initialize all entries: by default we return the no-match-id
+    for (uint32_t i = 0; i < NUM_ENTRIES_PER_LUT; ++i) {
+      lut[i] = no_match_id;
+    }
+
+    // Set up lookup table
+    uint32_t sg_id = 0;
+    for (auto const& sg_symbols : symbol_strings) {
+      for (auto const& sg_symbol : sg_symbols) {
+        max_base_match_val = std::max(max_base_match_val, static_cast<SymbolGroupIdT>(sg_symbol));
+        lut[sg_symbol] = sg_id;
+      }
+      sg_id++;
+    }
+
+    // Initialize the out-of-bounds lookup: d_match_meta_data[max_base_match_val+1] -> no_match_id
+    lut[max_base_match_val + 1] = no_match_id;
+
+    // Alias memory / return memory requirements
+    kernel_param.num_valid_entries = max_base_match_val + 2;
+    if (d_temp_storage) {
+      cudaError_t error = cudaMemcpyAsync(d_temp_storage,
+                                          lut.data(),
+                                          kernel_param.num_valid_entries * sizeof(SymbolGroupIdT),
+                                          cudaMemcpyHostToDevice,
+                                          stream);
+
+      kernel_param.d_match_meta_data = reinterpret_cast<SymbolGroupIdT*>(d_temp_storage);
+      return error;
+    } else {
+      d_temp_storage_bytes = kernel_param.num_valid_entries * sizeof(SymbolGroupIdT);
+      return cudaSuccess;
+    }
+
+    return cudaSuccess;
+  }
+
+  //------------------------------------------------------------------------------
+  // MEMBER VARIABLES
+  //------------------------------------------------------------------------------
+  _TempStorage& temp_storage;
+  SymbolGroupIdT num_valid_entries;
+
+  //------------------------------------------------------------------------------
+  // CONSTRUCTOR
+  //------------------------------------------------------------------------------
+  __device__ __forceinline__ _TempStorage& PrivateStorage()
+  {
+    __shared__ _TempStorage private_storage;
+    return private_storage;
+  }
+
+  __host__ __device__ __forceinline__ SingleSymbolSmemLUT(KernelParameter const& kernel_param,
+                                                          TempStorage& temp_storage)
+    : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries)
+  {
+    // GPU-side init
+#if CUB_PTX_ARCH > 0
+    for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) {
+      this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i];
+    }
+    __syncthreads();
+
+#else
+    // CPU-side init
+    for (std::size_t i = 0; i < kernel_param.num_valid_entries; i++) {
+      this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i];
+    }
+#endif
+  }
+
+  __host__ __device__ __forceinline__ int32_t operator()(SymbolT const symbol) const
+  {
+    // Look up the symbol group for given symbol
+    return temp_storage.match_meta_data[min(symbol, num_valid_entries - 1)];
+  }
+};
+
+} // namespace detail
+} // namespace fst
+} // namespace io
+} // namespace cudf
diff --git a/cpp/src/io/fst/transition_table.cuh b/cpp/src/io/fst/transition_table.cuh
new file mode 100644
index 00000000000..97fef03d8af
--- /dev/null
+++ b/cpp/src/io/fst/transition_table.cuh
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +template +struct TransitionTable { + //------------------------------------------------------------------------------ + // DEFAULT TYPEDEFS + //------------------------------------------------------------------------------ + using ItemT = char; + + struct TransitionVectorWrapper { + const ItemT* data; + + __host__ __device__ TransitionVectorWrapper(const ItemT* data) : data(data) {} + + __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const { return data[index]; } + }; + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + using TransitionVectorT = TransitionVectorWrapper; + + struct _TempStorage { + // + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + struct KernelParameter { + ItemT* transitions; + }; + + using LoadAliasT = std::uint32_t; + + static constexpr std::size_t NUM_AUX_MEM_BYTES = + CUB_QUOTIENT_CEILING(MAX_NUM_STATES * MAX_NUM_SYMBOLS * sizeof(ItemT), sizeof(LoadAliasT)) * + sizeof(LoadAliasT); + + //------------------------------------------------------------------------------ + // HELPER METHODS + //------------------------------------------------------------------------------ + __host__ static cudaError_t CreateTransitionTable( + void* d_temp_storage, + size_t& temp_storage_bytes, + const std::vector>& trans_table, + KernelParameter& kernel_param, + cudaStream_t stream = 0) + { + if (!d_temp_storage) { + temp_storage_bytes = NUM_AUX_MEM_BYTES; + return cudaSuccess; + } + + // trans_vectors[symbol][state] -> new_state + ItemT trans_vectors[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + + // trans_table[state][symbol] -> new state + for (std::size_t state = 0; state < trans_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { + trans_vectors[symbol * MAX_NUM_STATES + state] = trans_table[state][symbol]; + } + } + + kernel_param.transitions = static_cast(d_temp_storage); + + // Copy transition table to device + return cudaMemcpyAsync( + d_temp_storage, trans_vectors, NUM_AUX_MEM_BYTES, cudaMemcpyHostToDevice, stream); + } + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __host__ __device__ __forceinline__ TransitionTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < 
CUB_QUOTIENT_CEILING(NUM_AUX_MEM_BYTES, sizeof(LoadAliasT));
+         i += blockDim.x) {
+      reinterpret_cast<LoadAliasT*>(this->temp_storage.transitions)[i] =
+        reinterpret_cast<LoadAliasT*>(kernel_param.transitions)[i];
+    }
+    __syncthreads();
+#else
+    for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) {
+      this->temp_storage.transitions[i] = kernel_param.transitions[i];
+    }
+#endif
+  }
+
+  /**
+   * @brief Looks up the new target state for a given combination of current state and matched
+   * symbol group id, i.e., (old_state, match_id) -> new_state.
+   *
+   * @param state_id The DFA's current state index from which we'll transition
+   * @param match_id The symbol group id of the symbol that we just read in
+   * @return The index of the state to transition to
+   */
+  template <typename StateIndexT, typename SymbolIndexT>
+  __host__ __device__ __forceinline__ int32_t operator()(StateIndexT state_id,
+                                                         SymbolIndexT match_id) const
+  {
+    return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id];
+  }
+};
+
+} // namespace detail
+} // namespace fst
+} // namespace io
+} // namespace cudf
diff --git a/cpp/src/io/fst/translation_table.cuh b/cpp/src/io/fst/translation_table.cuh
new file mode 100644
index 00000000000..bfbfd41e3f0
--- /dev/null
+++ b/cpp/src/io/fst/translation_table.cuh
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include "in_reg_array.cuh" + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +/** + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols to + * output + * + * @tparam OutSymbolT The symbol type being returned + * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + */ +template +struct TransducerLookupTable { + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + struct _TempStorage { + OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT out_symbols[MAX_TABLE_SIZE]; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + struct KernelParameter { + OutSymbolOffsetT* d_trans_offsets; + OutSymbolT* d_out_symbols; + }; + + //------------------------------------------------------------------------------ + // HELPER METHODS + //------------------------------------------------------------------------------ + __host__ static cudaError_t CreateTransitionTable( + void* d_temp_storage, + size_t& temp_storage_bytes, + const std::vector>>& trans_table, + KernelParameter& kernel_param, + cudaStream_t stream = 0) + { + enum { MEM_OFFSETS = 0, MEM_OUT_SYMBOLS, NUM_ALLOCATIONS }; + + size_t allocation_sizes[NUM_ALLOCATIONS] = {}; + void* allocations[NUM_ALLOCATIONS] = {}; + allocation_sizes[MEM_OFFSETS] = + (MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1) * sizeof(OutSymbolOffsetT); + allocation_sizes[MEM_OUT_SYMBOLS] = MAX_TABLE_SIZE * sizeof(OutSymbolT); + + // Alias the temporary allocations from the single storage blob (or compute the necessary size + // of the blob) + cudaError_t error = + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + if (error) return error; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == nullptr) return cudaSuccess; + + std::vector out_symbols; + out_symbols.reserve(MAX_TABLE_SIZE); + std::vector out_symbol_offsets; + out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); + out_symbol_offsets.push_back(0); + + int st = 0; + // Iterate over the states in the transition table + for (auto const& state_trans : trans_table) { + uint32_t num_added = 0; + // Iterate over the symbols in the transition table + for (auto const& symbol_out : state_trans) { + // Insert the output symbols for this specific (state, symbol) transition + out_symbols.insert(std::end(out_symbols), std::begin(symbol_out), std::end(symbol_out)); + out_symbol_offsets.push_back(out_symbols.size()); + num_added++; + } + st++; + + // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of + // this state) + if (MAX_NUM_SYMBOLS > num_added) { + int32_t count = MAX_NUM_SYMBOLS - num_added; + auto begin_it = std::prev(std::end(out_symbol_offsets)); + std::copy(begin_it, begin_it + count, std::back_inserter(out_symbol_offsets)); + } + } + + // Check whether runtime-provided table size exceeds the compile-time given max. 
table size + if (out_symbols.size() > MAX_TABLE_SIZE) { return cudaErrorInvalidValue; } + + kernel_param.d_trans_offsets = static_cast(allocations[MEM_OFFSETS]); + kernel_param.d_out_symbols = static_cast(allocations[MEM_OUT_SYMBOLS]); + + // Copy out symbols + error = cudaMemcpyAsync(kernel_param.d_trans_offsets, + out_symbol_offsets.data(), + out_symbol_offsets.size() * sizeof(out_symbol_offsets[0]), + cudaMemcpyHostToDevice, + stream); + if (error) { return error; } + + // Copy offsets into output symbols + return cudaMemcpyAsync(kernel_param.d_out_symbols, + out_symbols.data(), + out_symbols.size() * sizeof(out_symbols[0]), + cudaMemcpyHostToDevice, + stream); + } + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __host__ __device__ __forceinline__ TransducerLookupTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { + constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { + this->temp_storage.out_offset[i] = kernel_param.d_trans_offsets[i]; + } + // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared + // memory + __syncthreads(); + for (int i = threadIdx.x; i < this->temp_storage.out_offset[num_offsets - 1]; i += blockDim.x) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } + __syncthreads(); +#else + for (int i = 0; i < num_offsets; i++) { + this->temp_storage.out_symbol_offsets[i] = kernel_param.d_trans_offsets[i]; + } + for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } +#endif + } + + template + __host__ __device__ __forceinline__ OutSymbolT operator()(StateIndexT state_id, + SymbolIndexT match_id, + RelativeOffsetT relative_offset) const + { + auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; + return temp_storage.out_symbols[offset]; + } + + template + __host__ __device__ __forceinline__ OutSymbolOffsetT operator()(StateIndexT state_id, + SymbolIndexT match_id) const + { + return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - + temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; + } +}; + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1505c5cdd1b..d70a3d9518d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -225,6 +225,7 @@ ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) +ConfigureTest(FST_TEST io/fst/fst_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu 
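The transducer lookup table above keeps all output symbols in one flat out_symbols array plus an out_offset array indexed by state * MAX_NUM_SYMBOLS + symbol_gid, so the symbols emitted for a transition are the slice between two adjacent offsets, and their count is the difference of those offsets. The host-only sketch below illustrates that indexing with toy values; the table contents and the num_symbols name are illustrative assumptions, not taken from this patch.

#include <cstdio>
#include <vector>

int main()
{
  // Toy table: 2 states x 2 symbol groups (illustrative values only).
  // (state 0, sg 0) emits "ab", (state 0, sg 1) emits nothing,
  // (state 1, sg 0) emits "c",  (state 1, sg 1) emits "c".
  constexpr int num_symbols = 2;
  std::vector<int> out_offset   = {0, 2, 2, 3, 4};  // num_states * num_symbols + 1 entries
  std::vector<char> out_symbols = {'a', 'b', 'c', 'c'};

  int state = 0, symbol_gid = 0;
  int begin = out_offset[state * num_symbols + symbol_gid];
  // Number of output symbols for this transition (what operator()(state_id, match_id) computes)
  int count = out_offset[state * num_symbols + symbol_gid + 1] - begin;
  for (int i = 0; i < count; ++i) {
    // i-th output symbol (what operator()(state_id, match_id, relative_offset) returns)
    std::printf("%c", out_symbols[begin + i]);
  }
  std::printf("\n");  // prints "ab"
}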
new file mode 100644 index 00000000000..26bb9d47dca --- /dev/null +++ b/cpp/tests/io/fst/fst_test.cu @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +#include "cub/cub.cuh" + +#include +#include + +namespace { + +//------------------------------------------------------------------------------ +// CPU-BASED IMPLEMENTATIONS FOR VERIFICATION +//------------------------------------------------------------------------------ +/** + * @brief CPU-based implementation of a finite-state transducer (FST). + * + * @tparam InputItT Forward input iterator type to symbols fed into the FST + * @tparam StateT Type representing states of the finite-state machine + * @tparam SymbolGroupLutT Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. + * @tparam TransitionTableT Two-dimensional container type + * @tparam TransducerTableT Two-dimensional container type + * @tparam OutputItT Forward output iterator type + * @tparam IndexOutputItT Forward output iterator type + * @param[in] begin Forward iterator to the beginning of the symbol sequence + * @param[in] end Forward iterator to one past the last element of the symbol sequence + * @param[in] init_state The starting state of the finite-state machine + * @param[in] symbol_group_lut Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. The index of the symbol group containing a symbol being + * read will be used as symbol_gid of the transition and translation tables. 
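+ * For example, with the symbol groups used by this test ({, [, }, ], ", \), reading '[' yields
+ * symbol_gid 1, while a character contained in none of the groups falls into the implicit
+ * catch-all group, i.e., the last column of the transition and translation tables.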
+ * @param[in] transition_table The two-dimensional transition table, i.e., + * transition_table[state][symbol_gid] -> new_state + * @param[in] translation_table The two-dimensional transducer table, i.e., + * translation_table[state][symbol_gid] -> range_of_output_symbols + * @param[out] out_tape A forward output iterator to which the transduced input will be written + * @param[out] out_index_tape A forward output iterator to which indexes of the symbols that + * actually caused some output are written to + * @return A pair of iterators to one past the last element of (1) the transduced output symbol + * sequence and (2) the indexes of + */ +template +static std::pair fst_baseline(InputItT begin, + InputItT end, + StateT const& init_state, + SymbolGroupLutT symbol_group_lut, + TransitionTableT transition_table, + TransducerTableT translation_table, + OutputItT out_tape, + IndexOutputItT out_index_tape) +{ + // Initialize "FSM" with starting state + StateT state = init_state; + + // To track the symbol offset within the input that caused the FST to output + std::size_t in_offset = 0; + for (auto it = begin; it < end; it++) { + // The symbol currently being read + auto const& symbol = *it; + + std::size_t symbol_group = 0; + bool found = false; + + // Iterate over symbol groups and search for the first symbol group containing the current + // symbol + for (auto const& sg : symbol_group_lut) { + for (auto const& s : sg) + if (s == symbol) found = true; + if (found) break; + symbol_group++; + } + + // Output the translated symbols to the output tape + size_t inserted = 0; + for (auto out : translation_table[state][symbol_group]) { + // std::cout << in_offset << ": " << out << "\n"; + *out_tape = out; + ++out_tape; + inserted++; + } + + // Output the index of the current symbol, iff it caused some output to be written + if (inserted > 0) { + *out_index_tape = in_offset; + out_index_tape++; + } + + // Transition the state of the finite-state machine + state = transition_table[state][symbol_group]; + + in_offset++; + } + return {out_tape, out_index_tape}; +} + +//------------------------------------------------------------------------------ +// TEST FST SPECIFICATIONS +//------------------------------------------------------------------------------ +// FST to check for brackets and braces outside of pairs of quotes +// The state being active while being outside of a string. When encountering an opening bracket +// or curly brace, we push it onto the stack. When encountering a closing bracket or brace, we +// pop it from the stack. +constexpr uint32_t TT_OOS = 0U; + +// The state being active while being within a string (e.g., field name or a string value). We do +// not push or pop from the stack while being in this state. +constexpr uint32_t TT_STR = 1U; + +// The state being active after encountering an escape symbol (e.g., '\') while being in the TT_STR +// state. 
constexpr uint32_t TT_ESC = 2U; // cmt to avoid 'unused' warning + +// Total number of states +constexpr uint32_t TT_NUM_STATES = 3U; + +// Definition of the symbol groups +enum PDA_SG_ID { + OBC = 0U, ///< Opening brace SG: { + OBT, ///< Opening bracket SG: [ + CBC, ///< Closing brace SG: } + CBT, ///< Closing bracket SG: ] + QTE, ///< Quote character SG: " + ESC, ///< Escape character SG: '\' + OTR, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +// Transition table +const std::vector> pda_state_tt = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, + /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR, TT_STR}, + /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; + +// Translation table (i.e., for each transition, what are the symbols that we output) +const std::vector>> pda_out_tt = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}, + /* TT_STR */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}, + /* TT_ESC */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}; + +// The i-th string representing all the characters of a symbol group +const std::vector pda_sgs = {"{", "[", "}", "]", "\"", "\\"}; + +// The DFA's starting state +constexpr int32_t start_state = TT_OOS; + +} // namespace + +// Base test fixture for tests +struct FstTest : public cudf::test::BaseFixture { +}; + +TEST_F(FstTest, GroundTruth) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + + // Type sufficiently large to index symbols within the input and output (may be unsigned) + using SymbolOffsetT = uint32_t; + + // Helper class to set up transition table, symbol group lookup table, and translation table + using DfaFstT = cudf::io::fst::Dfa; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( {)" + R"(category": "reference",)" + R"("index:" [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} )" + R"({)" + R"("category": "reference",)" + R"("index:" [4,{},null,{"a":[]}],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} {} [] [ ])"; + + // Repeat input sample 1024x + for (std::size_t i = 0; i < 10; i++) + input += input; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + hostdevice_vector output_gpu(input.size(), stream_view); + hostdevice_vector out_indexes_gpu(input.size(), stream_view); + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + // Run algorithm + DfaFstT parser; + + // Initialize DFA + ASSERT_CUDA_SUCCEEDED(parser.Init(pda_sgs, pda_state_tt, pda_out_tt, stream)); + + std::size_t temp_storage_bytes = 0; + + // Query temporary storage requirements + ASSERT_CUDA_SUCCEEDED(parser.Transduce(nullptr, + temp_storage_bytes, + d_input.data(), + static_cast(d_input.size()), + output_gpu.device_ptr(), + out_indexes_gpu.device_ptr(), + cub::DiscardOutputIterator{}, + start_state, + stream)); + + // Allocate device-side temporary storage & run algorithm + rmm::device_buffer temp_storage{temp_storage_bytes, stream_view}; + 
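+ // Following the CUB-style two-phase pattern: the first Transduce call above (with a null
+ // temporary-storage pointer) only queried temp_storage_bytes; this second call performs the
+ // actual transduction using the buffer allocated above.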
ASSERT_CUDA_SUCCEEDED(parser.Transduce(temp_storage.data(), + temp_storage_bytes, + d_input.data(), + static_cast(d_input.size()), + output_gpu.device_ptr(), + out_indexes_gpu.device_ptr(), + cub::DiscardOutputIterator{}, + start_state, + stream)); + + // Async copy results from device to host + output_gpu.device_to_host(stream_view); + out_indexes_gpu.device_to_host(stream_view); + + // Prepare CPU-side results for verification + std::string output_cpu{}; + std::vector out_index_cpu{}; + output_cpu.reserve(input.size()); + out_index_cpu.reserve(input.size()); + + // Run CPU-side algorithm + fst_baseline(std::begin(input), + std::end(input), + start_state, + pda_sgs, + pda_state_tt, + pda_out_tt, + std::back_inserter(output_cpu), + std::back_inserter(out_index_cpu)); + + // Make sure results have been copied back to host + cudaStreamSynchronize(stream); + + // Verify results + ASSERT_EQ(output_gpu.size(), output_cpu.size()); + ASSERT_EQ(out_indexes_gpu.size(), out_index_cpu.size()); + for (std::size_t i = 0; i < output_gpu.size(); i++) { + ASSERT_EQ(output_gpu.host_ptr()[i], output_cpu[i]) << "Mismatch at index #" << i; + } + for (std::size_t i = 0; i < out_indexes_gpu.size(); i++) { + ASSERT_EQ(out_indexes_gpu.host_ptr()[i], out_index_cpu[i]) << "Mismatch at index #" << i; + } +} + +CUDF_TEST_PROGRAM_MAIN() From bb162547d29ad411a005802d4d9a7a5de19fdf9b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 20 Apr 2022 05:11:32 -0700 Subject: [PATCH 015/173] clean up & addressing review comments --- cpp/src/io/fst/agent_dfa.cuh | 28 ++++++++++++---------- cpp/src/io/fst/dispatch_dfa.cuh | 4 ++-- cpp/src/io/fst/in_reg_array.cuh | 42 ++++++++++++++++----------------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index d983f9287a9..0611973f78c 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -17,8 +17,6 @@ #include "in_reg_array.cuh" -#include - #include namespace cudf { @@ -40,13 +38,13 @@ template class MultiItemStateVector { public: template - constexpr CUDF_HOST_DEVICE void Set(IndexT index, StateIndexT value) noexcept + __host__ __device__ __forceinline__ void Set(IndexT index, StateIndexT value) noexcept { state_[index] = value; } template - constexpr CUDF_HOST_DEVICE StateIndexT Get(IndexT index) const noexcept + __host__ __device__ __forceinline__ StateIndexT Get(IndexT index) const noexcept { return state_[index]; } @@ -71,7 +69,7 @@ class MultiItemStateVector { template struct VectorCompositeOp { template - constexpr CUDF_HOST_DEVICE VectorT operator()(VectorT const& lhs, VectorT const& rhs) + __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { VectorT res; for (int32_t i = 0; i < NUM_ITEMS; ++i) { @@ -109,8 +107,7 @@ class DFASimulationCallbackWrapper { uint32_t count = transducer_table(old_state.Get(0), symbol_id); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = - transducer_table(old_state.Get(0), symbol_id, out_char); + out_it[out_count + out_char] = transducer_table(old_state.Get(0), symbol_id, out_char); out_idx_it[out_count + out_char] = offset + character_index; } } @@ -188,8 +185,8 @@ struct StateTransitionOp { __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, const SymbolIndexT& read_symbol_id) { - using TransitionVectorT= typename TransitionTableT::TransitionVectorT ; - 
old_state_vector = state_vector; + using TransitionVectorT = typename TransitionTableT::TransitionVectorT; + old_state_vector = state_vector; state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); } @@ -344,7 +341,8 @@ struct AgentDFA { { AliasedLoadT thread_units[UINTS_PER_THREAD]; - const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + const AliasedLoadT* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_units); #pragma unroll @@ -370,7 +368,8 @@ struct AgentDFA { OffsetT num_total_units = CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT)); - const AliasedLoadT* d_block_symbols = reinterpret_cast(d_chars + block_offset); + const AliasedLoadT* d_block_symbols = + reinterpret_cast(d_chars + block_offset); cub::LoadDirectStriped( threadIdx.x, d_block_symbols, thread_units, num_total_units); @@ -419,7 +418,8 @@ struct AgentDFA { const OffsetT num_total_symbols, StateVectorT& state_vector) { - using StateVectorTransitionOpT = StateVectorTransitionOp; + using StateVectorTransitionOpT = + StateVectorTransitionOp; // Start parsing and to transition states StateVectorTransitionOpT transition_op(transition_table, state_vector); @@ -650,7 +650,9 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper + DFASimulationCallbackWrapper callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); MultiItemStateVector t_start_state; diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index fc14faaf10a..316d6ea0d5f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -119,7 +119,7 @@ struct DispatchFSM : DeviceFSMPolicy { TransducedIndexOutItT transduced_out_idx_it; TransducedCountOutItT d_num_transduced_out_it; cudaStream_t stream; - int ptx_version; + int const ptx_version; //------------------------------------------------------------------------------ // CONSTRUCTOR @@ -422,7 +422,7 @@ struct DispatchFSM : DeviceFSMPolicy { // COMPUTE STATE-TRANSITION VECTORS //------------------------------------------------------------------------------ ScanTileStateT stv_tile_state; - if constexpr(SINGLE_PASS_STV) { + if constexpr (SINGLE_PASS_STV) { // Construct the tile status (aliases memory internally et al.) error = stv_tile_state.Init( num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]); diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index f9619c82fe8..ed5948249d4 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -35,56 +35,55 @@ namespace detail { * @tparam BackingFragmentT The data type that is holding the fragments */ template -struct MultiFragmentInRegArray { - /// [b] Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] +class MultiFragmentInRegArray { + private: + /// Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] static constexpr uint32_t MIN_BITS_PER_ITEM = (MAX_ITEM_VALUE == 0) ? 
1 : cub::Log2<(MAX_ITEM_VALUE + 1)>::VALUE; /// Number of bits that each fragment can store static constexpr uint32_t NUM_BITS_PER_FRAGMENT = sizeof(BackingFragmentT) * 8; - /// [a] The number of bits per fragment per item in the array + /// The number of bits per fragment per item in the array static constexpr uint32_t AVAIL_BITS_PER_FRAG_ITEM = NUM_BITS_PER_FRAGMENT / NUM_ITEMS; - /// [k] The number of bits per item per fragment to be a power of two to avoid costly integer + /// The number of bits per item per fragment to be a power of two to avoid costly integer /// multiplication - /// TODO: specialise for VOLTA and later architectures that have efficient integer multiplication static constexpr uint32_t BITS_PER_FRAG_ITEM = 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); - static constexpr uint32_t LOG2_BITS_PER_FRAG_ITEM = cub::Log2::VALUE; - // [f] Number of fragments required to store and to reconstruct an item + // Number of fragments required to store and to reconstruct each item static constexpr uint32_t FRAGMENTS_PER_ITEM = (MIN_BITS_PER_ITEM + BITS_PER_FRAG_ITEM - 1) / BITS_PER_FRAG_ITEM; //------------------------------------------------------------------------------ - // MEMBER VARIABLES + // HELPER FUNCTIONS //------------------------------------------------------------------------------ - __device__ __host__ __forceinline__ unsigned int bfe(const unsigned int& data, - unsigned int bit_start, - unsigned int num_bits) const + __device__ __host__ __forceinline__ uint32_t bfe(const uint32_t& data, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 return cub::BFE(data, bit_start, num_bits); #else - const unsigned int MASK = (1 << num_bits) - 1; + const uint32_t MASK = (1 << num_bits) - 1; return (data >> bit_start) & MASK; #endif } - __device__ __host__ __forceinline__ void bfi(unsigned int& data, - unsigned int bits, - unsigned int bit_start, - unsigned int num_bits) const + __device__ __host__ __forceinline__ void bfi(uint32_t& data, + uint32_t bits, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 cub::BFI(data, data, bits, bit_start, num_bits); #else - unsigned int x = bits << bit_start; - unsigned int y = data; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - data = (y & MASK_Y) | (x & MASK_X); + uint32_t x = bits << bit_start; + uint32_t y = data; + uint32_t MASK_X = ((1 << num_bits) - 1) << bit_start; + uint32_t MASK_Y = ~MASK_X; + data = (y & MASK_Y) | (x & MASK_X); #endif } @@ -93,6 +92,7 @@ struct MultiFragmentInRegArray { //------------------------------------------------------------------------------ // ACCESSORS //------------------------------------------------------------------------------ + public: __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const { uint32_t val = 0; From 4e42d0e8f673f78c65daf5a0e1de5c728727f395 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 25 Apr 2022 09:59:37 -0700 Subject: [PATCH 016/173] refactored lookup tables --- cpp/src/io/fst/agent_dfa.cuh | 3 - cpp/src/io/fst/device_dfa.cuh | 192 +++++++++++++-------------- cpp/src/io/fst/symbol_lut.cuh | 94 +++++-------- cpp/src/io/fst/transition_table.cuh | 109 +++++---------- cpp/src/io/fst/translation_table.cuh | 123 +++++++---------- cpp/tests/io/fst/fst_test.cu | 5 +- 6 files changed, 209 insertions(+), 317 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 
0611973f78c..3bc59160696 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -152,8 +152,6 @@ class StateVectorTransitionOp : public StateTransitionCallbackOp { __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, SymbolIndexT const read_symbol_id) const { - using TransitionVectorT = typename TransitionTableT::TransitionVectorT; - for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector.Set(i, transition_table(state_vector.Get(i), read_symbol_id)); } @@ -185,7 +183,6 @@ struct StateTransitionOp { __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, const SymbolIndexT& read_symbol_id) { - using TransitionVectorT = typename TransitionTableT::TransitionVectorT; old_state_vector = state_vector; state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh index 795c4c98bec..b12283a9673 100644 --- a/cpp/src/io/fst/device_dfa.cuh +++ b/cpp/src/io/fst/device_dfa.cuh @@ -15,8 +15,9 @@ */ #pragma once -#include "cub/util_type.cuh" #include "dispatch_dfa.cuh" + +#include #include #include #include @@ -95,140 +96,121 @@ cudaError_t DeviceTransduce(void* d_temp_storage, stream); } -/** - * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the - * transition table and its number of states, the mapping of symbols to symbol groups, and the - * translation table that specifies which state transitions cause which output to be written). - * - * @tparam OutSymbolT The symbol type being output by the finite-state transducer - * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of - * the transition table) - * @tparam TT_NUM_STATES The number of states defined by the DFA (the other dimension of the - * transition table) - */ -template -class Dfa { +template +class dfa_device_view { + private: + using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; + using transition_table_init_t = typename TransitionTableT::KernelParameter; + using translation_table_init_t = typename TranslationTableT::KernelParameter; + public: // The maximum number of states supported by this DFA instance // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = TT_NUM_STATES; + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - private: - // Symbol-group id lookup table - using MatcherT = detail::SingleSymbolSmemLUT; - using MatcherInitT = typename MatcherT::KernelParameter; - - // Transition table - using TransitionTableT = detail::TransitionTable; - using TransitionTableInitT = typename TransitionTableT::KernelParameter; - - // Translation lookup table - using OutSymbolOffsetT = uint32_t; - using TransducerTableT = detail::TransducerLookupTable; - using TransducerTableInitT = typename TransducerTableT::KernelParameter; - - // Private members (passed between host/device) - /// Information to initialize the device-side lookup table that maps symbol -> symbol group id - MatcherInitT symbol_matcher_init; - - /// Information to initialize the device-side transition table - TransitionTableInitT tt_init; - - /// Information to initialize the device-side translation table - TransducerTableInitT tt_out_init; - - public: //--------------------------------------------------------------------- // DEVICE-SIDE MEMBER FUNCTIONS 
//--------------------------------------------------------------------- - using SymbolGroupStorageT = typename MatcherT::TempStorage; + using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; using TransitionTableStorageT = typename TransitionTableT::TempStorage; - using TranslationTableStorageT = typename TransducerTableT::TempStorage; + using TranslationTableStorageT = typename TranslationTableT::TempStorage; __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) { - return MatcherT(symbol_matcher_init, temp_storage); + return SymbolGroupIdLookupT(*d_sgid_lut_init, temp_storage); } __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) { - return TransitionTableT(tt_init, temp_storage); + return TransitionTableT(*d_transition_table_init, temp_storage); } __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) { - return TransducerTableT(tt_out_init, temp_storage); + return TranslationTableT(*d_translation_table_init, temp_storage); } - //--------------------------------------------------------------------- - // HOST-SIDE MEMBER FUNCTIONS - //--------------------------------------------------------------------- - template - cudaError_t Init(SymbolGroupIdItT const& symbol_vec, - std::vector> const& tt_vec, - std::vector>> const& out_tt_vec, - cudaStream_t stream = 0) + dfa_device_view(sgid_lut_init_t const* d_sgid_lut_init, + transition_table_init_t const* d_transition_table_init, + translation_table_init_t const* d_translation_table_init) + : d_sgid_lut_init(d_sgid_lut_init), + d_transition_table_init(d_transition_table_init), + d_translation_table_init(d_translation_table_init) { - cudaError_t error = cudaSuccess; - - enum : uint32_t { MEM_SYMBOL_MATCHER = 0, MEM_TT, MEM_OUT_TT, NUM_ALLOCATIONS }; + } - size_t allocation_sizes[NUM_ALLOCATIONS] = {0}; - void* allocations[NUM_ALLOCATIONS] = {0}; + private: + sgid_lut_init_t const* d_sgid_lut_init; + transition_table_init_t const* d_transition_table_init; + translation_table_init_t const* d_translation_table_init; +}; - // Memory requirements: lookup table - error = MatcherT::PrepareLUT( - nullptr, allocation_sizes[MEM_SYMBOL_MATCHER], symbol_vec, symbol_matcher_init); - if (error) return error; +/** + * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the + * transition table and its number of states, the mapping of symbols to symbol groups, and the + * translation table that specifies which state transitions cause which output to be written). 
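+ * The lookup tables are initialized host-side into hostdevice_vector buffers and copied to the
+ * device asynchronously on the stream passed to the constructor; get_device_view() then exposes
+ * their device-side copies to the transduction kernels.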
+ * + * @tparam OutSymbolT The symbol type being output by the finite-state transducer + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of + * the transition table) + * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the + * transition table) + */ +template +class Dfa { + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - // Memory requirements: transition table - error = - TransitionTableT::CreateTransitionTable(nullptr, allocation_sizes[MEM_TT], tt_vec, tt_init); - if (error) return error; + private: + // Symbol-group id lookup table + using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; + using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; - // Memory requirements: transducer table - error = TransducerTableT::CreateTransitionTable( - nullptr, allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init); - if (error) return error; + // Transition table + using TransitionTableT = detail::TransitionTable; + using TransitionTableInitT = typename TransitionTableT::KernelParameter; - // Memory requirements: total memory - size_t temp_storage_bytes = 0; - error = cub::AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes); - if (error) return error; + // Translation lookup table + using OutSymbolOffsetT = uint32_t; + using TranslationTableT = detail::TransducerLookupTable; + using TranslationTableInitT = typename TranslationTableT::KernelParameter; + + auto get_device_view() + { + return dfa_device_view{ + sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; + } - // Allocate memory - void* d_temp_storage = nullptr; - error = cudaMalloc(&d_temp_storage, temp_storage_bytes); - if (error) return error; + public: + template + Dfa(SymbolGroupIdItT const& symbol_vec, + std::vector> const& tt_vec, + std::vector>> const& out_tt_vec, + cudaStream_t stream) + { + constexpr std::size_t single_item = 1; - // Alias memory - error = - cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); - if (error) return error; + sgid_init = hostdevice_vector{single_item, stream}; + transition_table_init = hostdevice_vector{single_item, stream}; + translation_table_init = hostdevice_vector{single_item, stream}; - // Initialize symbol group lookup table - error = MatcherT::PrepareLUT(allocations[MEM_SYMBOL_MATCHER], - allocation_sizes[MEM_SYMBOL_MATCHER], - symbol_vec, - symbol_matcher_init, - stream); - if (error) return error; + // Initialize symbol group id lookup table + SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); // Initialize state transition table - error = TransitionTableT::CreateTransitionTable( - allocations[MEM_TT], allocation_sizes[MEM_TT], tt_vec, tt_init, stream); - if (error) return error; + TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); // Initialize finite-state transducer lookup table - error = TransducerTableT::CreateTransitionTable( - allocations[MEM_OUT_TT], allocation_sizes[MEM_OUT_TT], out_tt_vec, tt_out_init, stream); - if (error) return error; - - return error; + TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); } template get_device_view(), d_chars, num_chars, d_out_it, @@ -257,8 +239,12 @@ class Dfa { seed_state, stream); } -}; + private: + 
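+ // Host/device buffer pairs holding the lookup-table initializers; get_device_view() hands their
+ // device-side copies to the transduction kernel.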
hostdevice_vector sgid_init{}; + hostdevice_vector transition_table_init{}; + hostdevice_vector translation_table_init{}; +}; } // namespace fst } // namespace io } // namespace cudf diff --git a/cpp/src/io/fst/symbol_lut.cuh b/cpp/src/io/fst/symbol_lut.cuh index 08d5f4db58d..abf71a7fbea 100644 --- a/cpp/src/io/fst/symbol_lut.cuh +++ b/cpp/src/io/fst/symbol_lut.cuh @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include #include @@ -34,38 +37,29 @@ namespace detail { * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id */ template -struct SingleSymbolSmemLUT { - //------------------------------------------------------------------------------ - // DEFAULT TYPEDEFS - //------------------------------------------------------------------------------ +class SingleSymbolSmemLUT { + private: // Type used for representing a symbol group id (i.e., what we return for a given symbol) using SymbolGroupIdT = uint8_t; - //------------------------------------------------------------------------------ - // DERIVED CONFIGURATIONS - //------------------------------------------------------------------------------ /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); - //------------------------------------------------------------------------------ - // TYPEDEFS - //------------------------------------------------------------------------------ - struct _TempStorage { - // d_match_meta_data[symbol] -> symbol group index - SymbolGroupIdT match_meta_data[NUM_ENTRIES_PER_LUT]; + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; }; + public: struct KernelParameter { - // d_match_meta_data[min(symbol,num_valid_entries)] -> symbol group index - SymbolGroupIdT num_valid_entries; + // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index + SymbolT num_valid_entries; - // d_match_meta_data[symbol] -> symbol group index - SymbolGroupIdT* d_match_meta_data; + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; }; - struct TempStorage : cub::Uninitialized<_TempStorage> { - }; + using TempStorage = cub::Uninitialized<_TempStorage>; //------------------------------------------------------------------------------ // HELPER METHODS @@ -73,66 +67,48 @@ struct SingleSymbolSmemLUT { /** * @brief * - * @param[in] d_temp_storage Device-side temporary storage that can be used to store the lookup - * table. If no storage is provided it will return the temporary storage requirements in \p - * d_temp_storage_bytes. - * @param[in,out] d_temp_storage_bytes Amount of device-side temporary storage that can be used in - * the number of bytes + * @param[out] sgid_init A hostdevice_vector that will be populated * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols * (characters!) that correspond to the i-th symbol group index - * @param[out] kernel_param The kernel parameter object to be initialized with the given mapping - * of symbols to symbol group ids. 
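+ * Symbols that are not contained in any of the given strings are mapped to the no-match group id,
+ * i.e., symbol_strings.size().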
* @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table * @return */ template - __host__ __forceinline__ static cudaError_t PrepareLUT(void* d_temp_storage, - size_t& d_temp_storage_bytes, - SymbolGroupItT const& symbol_strings, - KernelParameter& kernel_param, - cudaStream_t stream = 0) + static void InitDeviceSymbolGroupIdLut(hostdevice_vector& sgid_init, + SymbolGroupItT const& symbol_strings, + rmm::cuda_stream_view stream) { // The symbol group index to be returned if none of the given symbols match SymbolGroupIdT no_match_id = symbol_strings.size(); - std::vector lut(NUM_ENTRIES_PER_LUT); + // The symbol with the largest value that is mapped to a symbol group id SymbolGroupIdT max_base_match_val = 0; // Initialize all entries: by default we return the no-match-id - for (uint32_t i = 0; i < NUM_ENTRIES_PER_LUT; ++i) { - lut[i] = no_match_id; - } + std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], + &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], + no_match_id); // Set up lookup table uint32_t sg_id = 0; + // Iterate over the symbol groups for (auto const& sg_symbols : symbol_strings) { + // Iterate over all symbols that belong to the current symbol group for (auto const& sg_symbol : sg_symbols) { max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); - lut[sg_symbol] = sg_id; + sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; } sg_id++; } - // Initialize the out-of-bounds lookup: d_match_meta_data[max_base_match_val+1] -> no_match_id - lut[max_base_match_val + 1] = no_match_id; + // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id + sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; // Alias memory / return memory requiremenets - kernel_param.num_valid_entries = max_base_match_val + 2; - if (d_temp_storage) { - cudaError_t error = cudaMemcpyAsync(d_temp_storage, - lut.data(), - kernel_param.num_valid_entries * sizeof(SymbolGroupIdT), - cudaMemcpyHostToDevice, - stream); - - kernel_param.d_match_meta_data = reinterpret_cast(d_temp_storage); - return error; - } else { - d_temp_storage_bytes = kernel_param.num_valid_entries * sizeof(SymbolGroupIdT); - return cudaSuccess; - } + // TODO I think this could be +1? 
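+ // Entries [0, max_base_match_val] hold the actual symbol-to-group-id mapping and the entry at
+ // max_base_match_val + 1 is the no-match sentinel, so num_valid_entries spans
+ // max_base_match_val + 2 slots; operator() clamps with min(symbol, num_valid_entries - 1) so any
+ // larger symbol lands on that sentinel.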
+ sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 2; - return cudaSuccess; + sgid_init.host_to_device(stream); } //------------------------------------------------------------------------------ @@ -150,29 +126,29 @@ struct SingleSymbolSmemLUT { return private_storage; } - __host__ __device__ __forceinline__ SingleSymbolSmemLUT(KernelParameter const& kernel_param, - TempStorage& temp_storage) + constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, + TempStorage& temp_storage) : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) { // GPU-side init #if CUB_PTX_ARCH > 0 for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { - this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i]; + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; } __syncthreads(); #else // CPU-side init for (std::size_t i = 0; i < kernel_param.num_luts; i++) { - this->temp_storage.match_meta_data[i] = kernel_param.d_match_meta_data[i]; + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; } #endif } - __host__ __device__ __forceinline__ int32_t operator()(SymbolT const symbol) const + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const { // Look up the symbol group for given symbol - return temp_storage.match_meta_data[min(symbol, num_valid_entries - 1)]; + return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; } }; diff --git a/cpp/src/io/fst/transition_table.cuh b/cpp/src/io/fst/transition_table.cuh index 97fef03d8af..5eccb926974 100644 --- a/cpp/src/io/fst/transition_table.cuh +++ b/cpp/src/io/fst/transition_table.cuh @@ -16,6 +16,10 @@ #pragma once +#include +#include +#include + #include #include @@ -25,103 +29,50 @@ namespace io { namespace fst { namespace detail { -template -struct TransitionTable { - //------------------------------------------------------------------------------ - // DEFAULT TYPEDEFS - //------------------------------------------------------------------------------ +template +class TransitionTable { + private: + // Type used using ItemT = char; - struct TransitionVectorWrapper { - const ItemT* data; - - __host__ __device__ TransitionVectorWrapper(const ItemT* data) : data(data) {} - - __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const { return data[index]; } - }; - - //------------------------------------------------------------------------------ - // TYPEDEFS - //------------------------------------------------------------------------------ - using TransitionVectorT = TransitionVectorWrapper; - struct _TempStorage { - // ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; - struct TempStorage : cub::Uninitialized<_TempStorage> { - }; + public: + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - ItemT* transitions; + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; - using LoadAliasT = std::uint32_t; - - static constexpr std::size_t NUM_AUX_MEM_BYTES = - CUB_QUOTIENT_CEILING(MAX_NUM_STATES * MAX_NUM_SYMBOLS * sizeof(ItemT), sizeof(LoadAliasT)) * - sizeof(LoadAliasT); - - //------------------------------------------------------------------------------ - // HELPER METHODS - //------------------------------------------------------------------------------ - __host__ static cudaError_t CreateTransitionTable( - void* d_temp_storage, - size_t& temp_storage_bytes, - const std::vector>& trans_table, - KernelParameter& kernel_param, - cudaStream_t stream = 0) 
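+ // Flattens trans_table[state][symbol] in symbol-major order so that a device-side transition
+ // lookup is a single shared-memory read at transitions[symbol * MAX_NUM_STATES + state], and
+ // copies the flattened table to the device on the given stream.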
+ static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, + const std::vector>& trans_table, + rmm::cuda_stream_view stream) { - if (!d_temp_storage) { - temp_storage_bytes = NUM_AUX_MEM_BYTES; - return cudaSuccess; - } - - // trans_vectors[symbol][state] -> new_state - ItemT trans_vectors[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; - // trans_table[state][symbol] -> new state for (std::size_t state = 0; state < trans_table.size(); ++state) { for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { - trans_vectors[symbol * MAX_NUM_STATES + state] = trans_table[state][symbol]; + transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = + trans_table[state][symbol]; } } - kernel_param.transitions = static_cast(d_temp_storage); - // Copy transition table to device - return cudaMemcpyAsync( - d_temp_storage, trans_vectors, NUM_AUX_MEM_BYTES, cudaMemcpyHostToDevice, stream); + transition_table_init.host_to_device(stream); } - //------------------------------------------------------------------------------ - // MEMBER VARIABLES - //------------------------------------------------------------------------------ - _TempStorage& temp_storage; - - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - //------------------------------------------------------------------------------ - // CONSTRUCTOR - //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ TransitionTable(const KernelParameter& kernel_param, - TempStorage& temp_storage) + constexpr CUDF_HOST_DEVICE TransitionTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) { #if CUB_PTX_ARCH > 0 - for (int i = threadIdx.x; i < CUB_QUOTIENT_CEILING(NUM_AUX_MEM_BYTES, sizeof(LoadAliasT)); - i += blockDim.x) { - reinterpret_cast(this->temp_storage.transitions)[i] = - reinterpret_cast(kernel_param.transitions)[i]; + for (int i = threadIdx.x; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i += blockDim.x) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; } __syncthreads(); #else - for (int i = 0; i < kernel_param.num_luts; i++) { + for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) { this->temp_storage.transitions[i] = kernel_param.transitions[i]; } #endif @@ -136,11 +87,21 @@ struct TransitionTable { * @return */ template - __host__ __device__ __forceinline__ int32_t operator()(StateIndexT state_id, - SymbolIndexT match_id) const + constexpr CUDF_HOST_DEVICE int32_t operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const { return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; - } + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + + return private_storage; + } }; } // namespace detail diff --git a/cpp/src/io/fst/translation_table.cuh b/cpp/src/io/fst/translation_table.cuh index bfbfd41e3f0..89da994606c 100644 --- a/cpp/src/io/fst/translation_table.cuh +++ b/cpp/src/io/fst/translation_table.cuh @@ -16,7 +16,12 @@ #pragma once -#include "in_reg_array.cuh" +#include +#include +#include +#include + +#include "rmm/device_uvector.hpp" #include @@ -28,10 +33,10 @@ namespace fst { namespace detail { /** - * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols to - * output + * @brief Lookup table mapping 
(old_state, symbol_group_id) transitions to a sequence of symbols + * that the finite-state transducer is supposed to output for each transition * - * @tparam OutSymbolT The symbol type being returned + * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support @@ -42,57 +47,35 @@ template -struct TransducerLookupTable { - //------------------------------------------------------------------------------ - // TYPEDEFS - //------------------------------------------------------------------------------ +class TransducerLookupTable { + private: struct _TempStorage { OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; OutSymbolT out_symbols[MAX_TABLE_SIZE]; }; - struct TempStorage : cub::Uninitialized<_TempStorage> { - }; + public: + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - OutSymbolOffsetT* d_trans_offsets; - OutSymbolT* d_out_symbols; + OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; }; - //------------------------------------------------------------------------------ - // HELPER METHODS - //------------------------------------------------------------------------------ - __host__ static cudaError_t CreateTransitionTable( - void* d_temp_storage, - size_t& temp_storage_bytes, - const std::vector>>& trans_table, - KernelParameter& kernel_param, - cudaStream_t stream = 0) + /** + * @brief Initializes the translation table (both the host and device parts) + */ + static void InitDeviceTranslationTable( + hostdevice_vector& translation_table_init, + std::vector>> const& trans_table, + rmm::cuda_stream_view stream) { - enum { MEM_OFFSETS = 0, MEM_OUT_SYMBOLS, NUM_ALLOCATIONS }; - - size_t allocation_sizes[NUM_ALLOCATIONS] = {}; - void* allocations[NUM_ALLOCATIONS] = {}; - allocation_sizes[MEM_OFFSETS] = - (MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1) * sizeof(OutSymbolOffsetT); - allocation_sizes[MEM_OUT_SYMBOLS] = MAX_TABLE_SIZE * sizeof(OutSymbolT); - - // Alias the temporary allocations from the single storage blob (or compute the necessary size - // of the blob) - cudaError_t error = - cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); - if (error) return error; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == nullptr) return cudaSuccess; - std::vector out_symbols; out_symbols.reserve(MAX_TABLE_SIZE); std::vector out_symbol_offsets; out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); out_symbol_offsets.push_back(0); - int st = 0; // Iterate over the states in the transition table for (auto const& state_trans : trans_table) { uint32_t num_added = 0; @@ -103,7 +86,6 @@ struct TransducerLookupTable { out_symbol_offsets.push_back(out_symbols.size()); num_added++; } - st++; // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of // this state) @@ -115,30 +97,21 @@ struct TransducerLookupTable { } // Check whether runtime-provided table size exceeds the compile-time given max. 
table size - if (out_symbols.size() > MAX_TABLE_SIZE) { return cudaErrorInvalidValue; } - - kernel_param.d_trans_offsets = static_cast(allocations[MEM_OFFSETS]); - kernel_param.d_out_symbols = static_cast(allocations[MEM_OUT_SYMBOLS]); - - // Copy out symbols - error = cudaMemcpyAsync(kernel_param.d_trans_offsets, - out_symbol_offsets.data(), - out_symbol_offsets.size() * sizeof(out_symbol_offsets[0]), - cudaMemcpyHostToDevice, - stream); - if (error) { return error; } - - // Copy offsets into output symbols - return cudaMemcpyAsync(kernel_param.d_out_symbols, - out_symbols.data(), - out_symbols.size() * sizeof(out_symbols[0]), - cudaMemcpyHostToDevice, - stream); + if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } + + // Prepare host-side data to be copied and passed to the device + std::copy(std::cbegin(out_symbol_offsets), + std::cend(out_symbol_offsets), + translation_table_init.host_ptr()->d_out_offsets); + std::copy(std::cbegin(out_symbols), + std::cend(out_symbols), + translation_table_init.host_ptr()->d_out_symbols); + + // Copy data to device + translation_table_init.host_to_device(stream); } - //------------------------------------------------------------------------------ - // MEMBER VARIABLES - //------------------------------------------------------------------------------ + private: _TempStorage& temp_storage; __device__ __forceinline__ _TempStorage& PrivateStorage() @@ -147,17 +120,19 @@ struct TransducerLookupTable { return private_storage; } - //------------------------------------------------------------------------------ - // CONSTRUCTOR - //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ TransducerLookupTable(const KernelParameter& kernel_param, - TempStorage& temp_storage) + public: + /** + * @brief Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, + TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) { constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; #if CUB_PTX_ARCH > 0 for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { - this->temp_storage.out_offset[i] = kernel_param.d_trans_offsets[i]; + this->temp_storage.out_offset[i] = kernel_param.d_out_offsets[i]; } // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared // memory @@ -168,7 +143,7 @@ struct TransducerLookupTable { __syncthreads(); #else for (int i = 0; i < num_offsets; i++) { - this->temp_storage.out_symbol_offsets[i] = kernel_param.d_trans_offsets[i]; + this->temp_storage.out_symbol_offsets[i] = kernel_param.d_out_offsets[i]; } for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; @@ -177,17 +152,17 @@ struct TransducerLookupTable { } template - __host__ __device__ __forceinline__ OutSymbolT operator()(StateIndexT state_id, - SymbolIndexT match_id, - RelativeOffsetT relative_offset) const + constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset) const { auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; return temp_storage.out_symbols[offset]; } template - __host__ __device__ __forceinline__ OutSymbolOffsetT operator()(StateIndexT 
state_id, - SymbolIndexT match_id) const + constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const { return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 26bb9d47dca..29c93a6f3bb 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -224,10 +224,7 @@ TEST_F(FstTest, GroundTruth) d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); // Run algorithm - DfaFstT parser; - - // Initialize DFA - ASSERT_CUDA_SUCCEEDED(parser.Init(pda_sgs, pda_state_tt, pda_out_tt, stream)); + DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream}; std::size_t temp_storage_bytes = 0; From e439320151e1dfc451aaf35a2fc7d42440d897fa Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 25 Apr 2022 12:17:08 -0700 Subject: [PATCH 017/173] put lookup tables into their own cudf file --- cpp/src/io/fst/device_dfa.cuh | 152 -------- cpp/src/io/fst/lookup_tables.cuh | 519 +++++++++++++++++++++++++++ cpp/src/io/fst/symbol_lut.cuh | 158 -------- cpp/src/io/fst/transition_table.cuh | 110 ------ cpp/src/io/fst/translation_table.cuh | 175 --------- cpp/tests/io/fst/fst_test.cu | 4 +- 6 files changed, 521 insertions(+), 597 deletions(-) create mode 100644 cpp/src/io/fst/lookup_tables.cuh delete mode 100644 cpp/src/io/fst/symbol_lut.cuh delete mode 100644 cpp/src/io/fst/transition_table.cuh delete mode 100644 cpp/src/io/fst/translation_table.cuh diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh index b12283a9673..d3f0e8be213 100644 --- a/cpp/src/io/fst/device_dfa.cuh +++ b/cpp/src/io/fst/device_dfa.cuh @@ -18,9 +18,6 @@ #include "dispatch_dfa.cuh" #include -#include -#include -#include #include @@ -96,155 +93,6 @@ cudaError_t DeviceTransduce(void* d_temp_storage, stream); } -template -class dfa_device_view { - private: - using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; - using transition_table_init_t = typename TransitionTableT::KernelParameter; - using translation_table_init_t = typename TranslationTableT::KernelParameter; - - public: - // The maximum number of states supported by this DFA instance - // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - - //--------------------------------------------------------------------- - // DEVICE-SIDE MEMBER FUNCTIONS - //--------------------------------------------------------------------- - using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; - using TransitionTableStorageT = typename TransitionTableT::TempStorage; - using TranslationTableStorageT = typename TranslationTableT::TempStorage; - - __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) - { - return SymbolGroupIdLookupT(*d_sgid_lut_init, temp_storage); - } - - __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) - { - return TransitionTableT(*d_transition_table_init, temp_storage); - } - - __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) - { - return TranslationTableT(*d_translation_table_init, temp_storage); - } - - dfa_device_view(sgid_lut_init_t const* d_sgid_lut_init, - transition_table_init_t const* d_transition_table_init, - translation_table_init_t const* d_translation_table_init) - : 
d_sgid_lut_init(d_sgid_lut_init), - d_transition_table_init(d_transition_table_init), - d_translation_table_init(d_translation_table_init) - { - } - - private: - sgid_lut_init_t const* d_sgid_lut_init; - transition_table_init_t const* d_transition_table_init; - translation_table_init_t const* d_translation_table_init; -}; - -/** - * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the - * transition table and its number of states, the mapping of symbols to symbol groups, and the - * translation table that specifies which state transitions cause which output to be written). - * - * @tparam OutSymbolT The symbol type being output by the finite-state transducer - * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of - * the transition table) - * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the - * transition table) - */ -template -class Dfa { - public: - // The maximum number of states supported by this DFA instance - // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - - private: - // Symbol-group id lookup table - using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; - using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; - - // Transition table - using TransitionTableT = detail::TransitionTable; - using TransitionTableInitT = typename TransitionTableT::KernelParameter; - - // Translation lookup table - using OutSymbolOffsetT = uint32_t; - using TranslationTableT = detail::TransducerLookupTable; - using TranslationTableInitT = typename TranslationTableT::KernelParameter; - - auto get_device_view() - { - return dfa_device_view{ - sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; - } - - public: - template - Dfa(SymbolGroupIdItT const& symbol_vec, - std::vector> const& tt_vec, - std::vector>> const& out_tt_vec, - cudaStream_t stream) - { - constexpr std::size_t single_item = 1; - - sgid_init = hostdevice_vector{single_item, stream}; - transition_table_init = hostdevice_vector{single_item, stream}; - translation_table_init = hostdevice_vector{single_item, stream}; - - // Initialize symbol group id lookup table - SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); - - // Initialize state transition table - TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); - - // Initialize finite-state transducer lookup table - TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); - } - - template - cudaError_t Transduce(void* d_temp_storage, - size_t& temp_storage_bytes, - SymbolT const* d_chars, - OffsetT num_chars, - TransducedOutItT d_out_it, - TransducedIndexOutItT d_out_idx_it, - TransducedCountOutItT d_num_transduced_out_it, - const uint32_t seed_state = 0, - cudaStream_t stream = 0) - { - return DeviceTransduce(d_temp_storage, - temp_storage_bytes, - this->get_device_view(), - d_chars, - num_chars, - d_out_it, - d_out_idx_it, - d_num_transduced_out_it, - seed_state, - stream); - } - - private: - hostdevice_vector sgid_init{}; - hostdevice_vector transition_table_init{}; - hostdevice_vector translation_table_init{}; -}; } // namespace fst } // namespace io } // namespace cudf diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh new file mode 100644 index 00000000000..58853919b69 --- /dev/null +++ b/cpp/src/io/fst/lookup_tables.cuh @@ 
-0,0 +1,519 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { + +/** + * @brief Class template that can be plugged into the finite-state machine to look up the symbol + * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no + * look-ahead). + * + * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id + */ +template +class SingleSymbolSmemLUT { + private: + // Type used for representing a symbol group id (i.e., what we return for a given symbol) + using SymbolGroupIdT = uint8_t; + + /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) + static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); + + struct _TempStorage { + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; + }; + + public: + struct KernelParameter { + // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index + SymbolT num_valid_entries; + + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; + }; + + using TempStorage = cub::Uninitialized<_TempStorage>; + + //------------------------------------------------------------------------------ + // HELPER METHODS + //------------------------------------------------------------------------------ + /** + * @brief + * + * @param[out] sgid_init A hostdevice_vector that will be populated + * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols + * (characters!) 
that correspond to the i-th symbol group index + * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table + * @return + */ + template + static void InitDeviceSymbolGroupIdLut(hostdevice_vector& sgid_init, + SymbolGroupItT const& symbol_strings, + rmm::cuda_stream_view stream) + { + // The symbol group index to be returned if none of the given symbols match + SymbolGroupIdT no_match_id = symbol_strings.size(); + + // The symbol with the largest value that is mapped to a symbol group id + SymbolGroupIdT max_base_match_val = 0; + + // Initialize all entries: by default we return the no-match-id + std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], + &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], + no_match_id); + + // Set up lookup table + uint32_t sg_id = 0; + // Iterate over the symbol groups + for (auto const& sg_symbols : symbol_strings) { + // Iterate over all symbols that belong to the current symbol group + for (auto const& sg_symbol : sg_symbols) { + max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); + sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; + } + sg_id++; + } + + // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id + sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; + + // Alias memory / return memory requiremenets + // TODO I think this could be +1? + sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 2; + + sgid_init.host_to_device(stream); + } + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + SymbolGroupIdT num_valid_entries; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) + { + // GPU-side init +#if CUB_PTX_ARCH > 0 + for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; + } + __syncthreads(); + +#else + // CPU-side init + for (std::size_t i = 0; i < kernel_param.num_luts; i++) { + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; + } +#endif + } + + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const + { + // Look up the symbol group for given symbol + return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; + } +}; + +template +class TransitionTable { + private: + // Type used + using ItemT = char; + + struct _TempStorage { + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, + const std::vector>& trans_table, + rmm::cuda_stream_view stream) + { + // trans_table[state][symbol] -> new state + for (std::size_t state = 0; state < trans_table.size(); ++state) { + for (std::size_t symbol = 0; symbol 
< trans_table[state].size(); ++symbol) { + transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = + trans_table[state][symbol]; + } + } + + // Copy transition table to device + transition_table_init.host_to_device(stream); + } + + constexpr CUDF_HOST_DEVICE TransitionTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i += blockDim.x) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; + } + __syncthreads(); +#else + for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; + } +#endif + } + + /** + * @brief Returns a random-access iterator to lookup all the state transitions for one specific + * symbol from an arbitrary old_state, i.e., it[old_state] -> new_state. + * + * @param state_id The DFA's current state index from which we'll transition + * @param match_id The symbol group id of the symbol that we just read in + * @return + */ + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const + { + return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + + return private_storage; + } +}; + +template +class dfa_device_view { + private: + using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; + using transition_table_init_t = typename TransitionTableT::KernelParameter; + using translation_table_init_t = typename TranslationTableT::KernelParameter; + + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + + using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; + using TransitionTableStorageT = typename TransitionTableT::TempStorage; + using TranslationTableStorageT = typename TranslationTableT::TempStorage; + + __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) + { + return SymbolGroupIdLookupT(*d_sgid_lut_init, temp_storage); + } + + __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) + { + return TransitionTableT(*d_transition_table_init, temp_storage); + } + + __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) + { + return TranslationTableT(*d_translation_table_init, temp_storage); + } + + dfa_device_view(sgid_lut_init_t const* d_sgid_lut_init, + transition_table_init_t const* d_transition_table_init, + translation_table_init_t const* d_translation_table_init) + : d_sgid_lut_init(d_sgid_lut_init), + d_transition_table_init(d_transition_table_init), + d_translation_table_init(d_translation_table_init) + { + } + + private: + sgid_lut_init_t const* d_sgid_lut_init; + transition_table_init_t const* d_transition_table_init; + translation_table_init_t const* d_translation_table_init; +}; + +/** + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols + * that the finite-state transducer is supposed to output for each transition + * + * @tparam OutSymbolT The symbol type being output + * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols + * @tparam MAX_NUM_SYMBOLS The maximum 
number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + */ +template +class TransducerLookupTable { + private: + struct _TempStorage { + OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT out_symbols[MAX_TABLE_SIZE]; + }; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; + }; + + /** + * @brief Initializes the translation table (both the host and device parts) + */ + static void InitDeviceTranslationTable( + hostdevice_vector& translation_table_init, + std::vector>> const& trans_table, + rmm::cuda_stream_view stream) + { + std::vector out_symbols; + out_symbols.reserve(MAX_TABLE_SIZE); + std::vector out_symbol_offsets; + out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); + out_symbol_offsets.push_back(0); + + // Iterate over the states in the transition table + for (auto const& state_trans : trans_table) { + uint32_t num_added = 0; + // Iterate over the symbols in the transition table + for (auto const& symbol_out : state_trans) { + // Insert the output symbols for this specific (state, symbol) transition + out_symbols.insert(std::end(out_symbols), std::begin(symbol_out), std::end(symbol_out)); + out_symbol_offsets.push_back(out_symbols.size()); + num_added++; + } + + // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of + // this state) + if (MAX_NUM_SYMBOLS > num_added) { + int32_t count = MAX_NUM_SYMBOLS - num_added; + auto begin_it = std::prev(std::end(out_symbol_offsets)); + std::copy(begin_it, begin_it + count, std::back_inserter(out_symbol_offsets)); + } + } + + // Check whether runtime-provided table size exceeds the compile-time given max. 
table size + if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } + + // Prepare host-side data to be copied and passed to the device + std::copy(std::cbegin(out_symbol_offsets), + std::cend(out_symbol_offsets), + translation_table_init.host_ptr()->d_out_offsets); + std::copy(std::cbegin(out_symbols), + std::cend(out_symbols), + translation_table_init.host_ptr()->d_out_symbols); + + // Copy data to device + translation_table_init.host_to_device(stream); + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + public: + /** + * @brief Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { + constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { + this->temp_storage.out_offset[i] = kernel_param.d_out_offsets[i]; + } + // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared + // memory + __syncthreads(); + for (int i = threadIdx.x; i < this->temp_storage.out_offset[num_offsets - 1]; i += blockDim.x) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } + __syncthreads(); +#else + for (int i = 0; i < num_offsets; i++) { + this->temp_storage.out_symbol_offsets[i] = kernel_param.d_out_offsets[i]; + } + for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } +#endif + } + + template + constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset) const + { + auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; + return temp_storage.out_symbols[offset]; + } + + template + constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const + { + return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - + temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; + } +}; + +/** + * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the + * transition table and its number of states, the mapping of symbols to symbol groups, and the + * translation table that specifies which state transitions cause which output to be written). 
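(Editorial sketch, not part of the patch.) The TransducerLookupTable above stores all output sequences in CSR fashion: one flat out_symbols array plus a prefix-offset array indexed by state * MAX_NUM_SYMBOLS + symbol_group. A minimal host-only sketch of that layout and of the two operator() lookups follows; the table contents and the helper names (count, nth) are illustrative only.

// Host-side sketch of the translation-table layout: per-(state, symbol group)
// output sequences are flattened into one symbol array plus prefix offsets, so
// a transition's outputs live between offsets[idx] and offsets[idx + 1].
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  constexpr int NUM_SYMBOLS = 3;

  // out_tt[state][symbol_group] -> output symbol sequence (illustrative values)
  std::vector<std::vector<std::string>> out_tt = {{"{", "", "x"}, {"", "}", ""}};

  // Flatten into one symbol array plus prefix offsets, as InitDeviceTranslationTable does
  std::vector<char> out_symbols;
  std::vector<uint32_t> offsets{0};
  for (auto const& state_row : out_tt) {
    for (auto const& seq : state_row) {
      out_symbols.insert(out_symbols.end(), seq.begin(), seq.end());
      offsets.push_back(static_cast<uint32_t>(out_symbols.size()));
    }
  }

  // Mirrors the two operator() overloads: number of outputs, and the k-th output symbol
  auto count = [&](int state, int sg) {
    return offsets[state * NUM_SYMBOLS + sg + 1] - offsets[state * NUM_SYMBOLS + sg];
  };
  auto nth = [&](int state, int sg, uint32_t k) {
    return out_symbols[offsets[state * NUM_SYMBOLS + sg] + k];
  };

  printf("state 0, symbol group 0 emits %u symbol(s); first is '%c'\n",
         static_cast<unsigned>(count(0, 0)),
         nth(0, 0, 0));
  return 0;
}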
+ * + * @tparam OutSymbolT The symbol type being output by the finite-state transducer + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate (one dimension of + * the transition table) + * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the + * transition table) + */ +template +class Dfa { + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + + private: + // Symbol-group id lookup table + using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; + using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; + + // Transition table + using TransitionTableT = detail::TransitionTable; + using TransitionTableInitT = typename TransitionTableT::KernelParameter; + + // Translation lookup table + using OutSymbolOffsetT = uint32_t; + using TranslationTableT = detail::TransducerLookupTable; + using TranslationTableInitT = typename TranslationTableT::KernelParameter; + + auto get_device_view() + { + return dfa_device_view{ + sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; + } + + public: + template + Dfa(SymbolGroupIdItT const& symbol_vec, + std::vector> const& tt_vec, + std::vector>> const& out_tt_vec, + cudaStream_t stream) + { + constexpr std::size_t single_item = 1; + + sgid_init = hostdevice_vector{single_item, stream}; + transition_table_init = hostdevice_vector{single_item, stream}; + translation_table_init = hostdevice_vector{single_item, stream}; + + // Initialize symbol group id lookup table + SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); + + // Initialize state transition table + TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); + + // Initialize finite-state transducer lookup table + TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); + } + + template + cudaError_t Transduce(void* d_temp_storage, + size_t& temp_storage_bytes, + SymbolT const* d_chars, + OffsetT num_chars, + TransducedOutItT d_out_it, + TransducedIndexOutItT d_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + const uint32_t seed_state = 0, + cudaStream_t stream = 0) + { + return DeviceTransduce(d_temp_storage, + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + } + + private: + hostdevice_vector sgid_init{}; + hostdevice_vector transition_table_init{}; + hostdevice_vector translation_table_init{}; +}; + +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/fst/symbol_lut.cuh b/cpp/src/io/fst/symbol_lut.cuh deleted file mode 100644 index abf71a7fbea..00000000000 --- a/cpp/src/io/fst/symbol_lut.cuh +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace fst { -namespace detail { -/** - * @brief Class template that can be plugged into the finite-state machine to look up the symbol - * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no - * look-ahead). - * - * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id - */ -template -class SingleSymbolSmemLUT { - private: - // Type used for representing a symbol group id (i.e., what we return for a given symbol) - using SymbolGroupIdT = uint8_t; - - /// Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) - static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); - - struct _TempStorage { - // sym_to_sgid[symbol] -> symbol group index - SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; - }; - - public: - struct KernelParameter { - // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index - SymbolT num_valid_entries; - - // sym_to_sgid[symbol] -> symbol group index - SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; - }; - - using TempStorage = cub::Uninitialized<_TempStorage>; - - //------------------------------------------------------------------------------ - // HELPER METHODS - //------------------------------------------------------------------------------ - /** - * @brief - * - * @param[out] sgid_init A hostdevice_vector that will be populated - * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols - * (characters!) that correspond to the i-th symbol group index - * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table - * @return - */ - template - static void InitDeviceSymbolGroupIdLut(hostdevice_vector& sgid_init, - SymbolGroupItT const& symbol_strings, - rmm::cuda_stream_view stream) - { - // The symbol group index to be returned if none of the given symbols match - SymbolGroupIdT no_match_id = symbol_strings.size(); - - // The symbol with the largest value that is mapped to a symbol group id - SymbolGroupIdT max_base_match_val = 0; - - // Initialize all entries: by default we return the no-match-id - std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], - &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], - no_match_id); - - // Set up lookup table - uint32_t sg_id = 0; - // Iterate over the symbol groups - for (auto const& sg_symbols : symbol_strings) { - // Iterate over all symbols that belong to the current symbol group - for (auto const& sg_symbol : sg_symbols) { - max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); - sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; - } - sg_id++; - } - - // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id - sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; - - // Alias memory / return memory requiremenets - // TODO I think this could be +1? 
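(Editorial sketch, not part of the patch.) As a reference for the lookup logic above: a small host-only sketch of the dense symbol-to-symbol-group-id table that SingleSymbolSmemLUT builds, including the clamp to num_valid_entries that bounds how many entries the constructor has to copy. The symbol groups below are illustrative values, not taken from the patch.

// Host-side sketch of the dense symbol -> symbol-group-id table: every 8-bit
// symbol maps to its group id, symbols not listed in any group map to
// no_match_id (== number of groups), and lookups are clamped so only the first
// num_valid_entries entries ever matter.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  std::vector<std::string> symbol_groups = {"{", "[", "}", "]", "\"", "\\"};  // illustrative
  uint8_t const no_match_id = static_cast<uint8_t>(symbol_groups.size());

  std::vector<uint8_t> sym_to_sgid(256, no_match_id);
  uint8_t max_base_match_val = 0;
  for (std::size_t sg_id = 0; sg_id < symbol_groups.size(); ++sg_id) {
    for (unsigned char c : symbol_groups[sg_id]) {
      max_base_match_val = std::max(max_base_match_val, static_cast<uint8_t>(c));
      sym_to_sgid[c]     = static_cast<uint8_t>(sg_id);
    }
  }
  // One extra entry past the largest mapped symbol still returns no_match_id;
  // lookups clamp to it, so larger symbols also resolve to "no match".
  uint32_t const num_valid_entries = max_base_match_val + 2u;

  auto lookup = [&](unsigned char symbol) {
    return sym_to_sgid[std::min<uint32_t>(symbol, num_valid_entries - 1)];
  };

  printf("'{' -> %d, 'a' -> %d (no match)\n", lookup('{'), lookup('a'));
  return 0;
}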
- sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 2; - - sgid_init.host_to_device(stream); - } - - //------------------------------------------------------------------------------ - // MEMBER VARIABLES - //------------------------------------------------------------------------------ - _TempStorage& temp_storage; - SymbolGroupIdT num_valid_entries; - - //------------------------------------------------------------------------------ - // CONSTRUCTOR - //------------------------------------------------------------------------------ - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, - TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) - { - // GPU-side init -#if CUB_PTX_ARCH > 0 - for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { - this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; - } - __syncthreads(); - -#else - // CPU-side init - for (std::size_t i = 0; i < kernel_param.num_luts; i++) { - this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; - } -#endif - } - - constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const - { - // Look up the symbol group for given symbol - return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; - } -}; - -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/fst/transition_table.cuh b/cpp/src/io/fst/transition_table.cuh deleted file mode 100644 index 5eccb926974..00000000000 --- a/cpp/src/io/fst/transition_table.cuh +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace fst { -namespace detail { - -template -class TransitionTable { - private: - // Type used - using ItemT = char; - - struct _TempStorage { - ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; - }; - - public: - using TempStorage = cub::Uninitialized<_TempStorage>; - - struct KernelParameter { - ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; - }; - - static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, - const std::vector>& trans_table, - rmm::cuda_stream_view stream) - { - // trans_table[state][symbol] -> new state - for (std::size_t state = 0; state < trans_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < trans_table[state].size(); ++symbol) { - transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = - trans_table[state][symbol]; - } - } - - // Copy transition table to device - transition_table_init.host_to_device(stream); - } - - constexpr CUDF_HOST_DEVICE TransitionTable(const KernelParameter& kernel_param, - TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()) - { -#if CUB_PTX_ARCH > 0 - for (int i = threadIdx.x; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i += blockDim.x) { - this->temp_storage.transitions[i] = kernel_param.transitions[i]; - } - __syncthreads(); -#else - for (int i = 0; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i++) { - this->temp_storage.transitions[i] = kernel_param.transitions[i]; - } -#endif - } - - /** - * @brief Returns a random-access iterator to lookup all the state transitions for one specific - * symbol from an arbitrary old_state, i.e., it[old_state] -> new_state. - * - * @param state_id The DFA's current state index from which we'll transition - * @param match_id The symbol group id of the symbol that we just read in - * @return - */ - template - constexpr CUDF_HOST_DEVICE int32_t operator()(StateIndexT const state_id, - SymbolIndexT const match_id) const - { - return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; - } - - private: - _TempStorage& temp_storage; - - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - - return private_storage; - } -}; - -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/fst/translation_table.cuh b/cpp/src/io/fst/translation_table.cuh deleted file mode 100644 index 89da994606c..00000000000 --- a/cpp/src/io/fst/translation_table.cuh +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
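(Editorial sketch, not part of the patch.) The TransitionTable removed here, and re-added in lookup_tables.cuh above, flattens the 2D table in symbol-major order so that all successor states for one symbol group are adjacent. A host-only sketch of that indexing, with an illustrative 3-state, 2-symbol-group table:

// Host-side sketch of TransitionTable's symbol-major layout:
// trans_table[state][symbol_group] is flattened as
// transitions[symbol_group * MAX_NUM_STATES + state], matching the
// operator()(state_id, match_id) lookup above.
#include <cstdio>
#include <vector>

int main()
{
  constexpr int MAX_NUM_STATES  = 3;
  constexpr int MAX_NUM_SYMBOLS = 2;

  // trans_table[state][symbol_group] -> next state (illustrative values)
  std::vector<std::vector<char>> trans_table = {{1, 0}, {2, 0}, {2, 1}};

  std::vector<char> transitions(MAX_NUM_STATES * MAX_NUM_SYMBOLS);
  for (int state = 0; state < MAX_NUM_STATES; ++state)
    for (int symbol = 0; symbol < MAX_NUM_SYMBOLS; ++symbol)
      transitions[symbol * MAX_NUM_STATES + state] = trans_table[state][symbol];

  // Lookup mirroring operator()(state_id, match_id)
  auto next_state = [&](int state_id, int match_id) {
    return static_cast<int>(transitions[match_id * MAX_NUM_STATES + state_id]);
  };

  printf("state 1 --symbol group 0--> state %d\n", next_state(1, 0));  // prints 2
  return 0;
}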
- */ - -#pragma once - -#include -#include -#include -#include - -#include "rmm/device_uvector.hpp" - -#include - -#include - -namespace cudf { -namespace io { -namespace fst { -namespace detail { - -/** - * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols - * that the finite-state transducer is supposed to output for each transition - * - * @tparam OutSymbolT The symbol type being output - * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition - * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support - * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - */ -template -class TransducerLookupTable { - private: - struct _TempStorage { - OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT out_symbols[MAX_TABLE_SIZE]; - }; - - public: - using TempStorage = cub::Uninitialized<_TempStorage>; - - struct KernelParameter { - OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; - }; - - /** - * @brief Initializes the translation table (both the host and device parts) - */ - static void InitDeviceTranslationTable( - hostdevice_vector& translation_table_init, - std::vector>> const& trans_table, - rmm::cuda_stream_view stream) - { - std::vector out_symbols; - out_symbols.reserve(MAX_TABLE_SIZE); - std::vector out_symbol_offsets; - out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); - out_symbol_offsets.push_back(0); - - // Iterate over the states in the transition table - for (auto const& state_trans : trans_table) { - uint32_t num_added = 0; - // Iterate over the symbols in the transition table - for (auto const& symbol_out : state_trans) { - // Insert the output symbols for this specific (state, symbol) transition - out_symbols.insert(std::end(out_symbols), std::begin(symbol_out), std::end(symbol_out)); - out_symbol_offsets.push_back(out_symbols.size()); - num_added++; - } - - // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of - // this state) - if (MAX_NUM_SYMBOLS > num_added) { - int32_t count = MAX_NUM_SYMBOLS - num_added; - auto begin_it = std::prev(std::end(out_symbol_offsets)); - std::copy(begin_it, begin_it + count, std::back_inserter(out_symbol_offsets)); - } - } - - // Check whether runtime-provided table size exceeds the compile-time given max. 
table size - if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } - - // Prepare host-side data to be copied and passed to the device - std::copy(std::cbegin(out_symbol_offsets), - std::cend(out_symbol_offsets), - translation_table_init.host_ptr()->d_out_offsets); - std::copy(std::cbegin(out_symbols), - std::cend(out_symbols), - translation_table_init.host_ptr()->d_out_symbols); - - // Copy data to device - translation_table_init.host_to_device(stream); - } - - private: - _TempStorage& temp_storage; - - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - public: - /** - * @brief Synchronizes the thread block, if called from device, and, hence, requires all threads - * of the thread block to call the constructor - */ - CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, - TempStorage& temp_storage) - : temp_storage(temp_storage.Alias()) - { - constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; -#if CUB_PTX_ARCH > 0 - for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { - this->temp_storage.out_offset[i] = kernel_param.d_out_offsets[i]; - } - // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared - // memory - __syncthreads(); - for (int i = threadIdx.x; i < this->temp_storage.out_offset[num_offsets - 1]; i += blockDim.x) { - this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; - } - __syncthreads(); -#else - for (int i = 0; i < num_offsets; i++) { - this->temp_storage.out_symbol_offsets[i] = kernel_param.d_out_offsets[i]; - } - for (int i = 0; i < this->temp_storage.out_symbol_offsets[i]; i++) { - this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; - } -#endif - } - - template - constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, - SymbolIndexT const match_id, - RelativeOffsetT const relative_offset) const - { - auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; - return temp_storage.out_symbols[offset]; - } - - template - constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, - SymbolIndexT const match_id) const - { - return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - - temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; - } -}; - -} // namespace detail -} // namespace fst -} // namespace io -} // namespace cudf diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 29c93a6f3bb..012c37ab842 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -189,7 +189,7 @@ TEST_F(FstTest, GroundTruth) using SymbolOffsetT = uint32_t; // Helper class to set up transition table, symbol group lookup table, and translation table - using DfaFstT = cudf::io::fst::Dfa; + using DfaFstT = cudf::io::fst::detail::Dfa; // Prepare cuda stream for data transfers & kernels cudaStream_t stream = nullptr; From 05840b32d3dc4a5ed37cc4d944b28b568bb0652e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 27 Apr 2022 04:42:31 -0700 Subject: [PATCH 018/173] Change interface for FST to not need temp storage --- cpp/src/io/fst/lookup_tables.cuh | 55 ++++++++++++++++++++------------ cpp/tests/io/fst/fst_test.cu | 39 ++++++++-------------- 2 files changed, 49 insertions(+), 45 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 58853919b69..f8921d4091b 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include #include @@ -485,26 +485,41 @@ class Dfa { typename TransducedIndexOutItT, typename TransducedCountOutItT, typename OffsetT> - cudaError_t Transduce(void* d_temp_storage, - size_t& temp_storage_bytes, - SymbolT const* d_chars, - OffsetT num_chars, - TransducedOutItT d_out_it, - TransducedIndexOutItT d_out_idx_it, - TransducedCountOutItT d_num_transduced_out_it, - const uint32_t seed_state = 0, - cudaStream_t stream = 0) + void Transduce(SymbolT const* d_chars, + OffsetT num_chars, + TransducedOutItT d_out_it, + TransducedIndexOutItT d_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + const uint32_t seed_state, + rmm::cuda_stream_view stream) { - return DeviceTransduce(d_temp_storage, - temp_storage_bytes, - this->get_device_view(), - d_chars, - num_chars, - d_out_it, - d_out_idx_it, - d_num_transduced_out_it, - seed_state, - stream); + std::size_t temp_storage_bytes = 0; + rmm::device_buffer temp_storage{}; + DeviceTransduce(nullptr, + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + + if (temp_storage.size() < temp_storage_bytes) { + temp_storage.resize(temp_storage_bytes, stream); + } + + DeviceTransduce(temp_storage.data(), + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); } private: diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 012c37ab842..9e8011bb7df 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -217,8 +217,10 @@ TEST_F(FstTest, GroundTruth) input += input; // Prepare input & output buffers + constexpr std::size_t single_item = 1; rmm::device_uvector d_input(input.size(), stream_view); hostdevice_vector output_gpu(input.size(), stream_view); + hostdevice_vector output_gpu_size(single_item, stream_view); hostdevice_vector out_indexes_gpu(input.size(), stream_view); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); @@ -228,32 +230,19 @@ TEST_F(FstTest, GroundTruth) std::size_t temp_storage_bytes = 0; - // Query temporary storage requirements - ASSERT_CUDA_SUCCEEDED(parser.Transduce(nullptr, - temp_storage_bytes, - d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), - out_indexes_gpu.device_ptr(), - cub::DiscardOutputIterator{}, - start_state, 
- stream)); - // Allocate device-side temporary storage & run algorithm - rmm::device_buffer temp_storage{temp_storage_bytes, stream_view}; - ASSERT_CUDA_SUCCEEDED(parser.Transduce(temp_storage.data(), - temp_storage_bytes, - d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), - out_indexes_gpu.device_ptr(), - cub::DiscardOutputIterator{}, - start_state, - stream)); + parser.Transduce(d_input.data(), + static_cast(d_input.size()), + output_gpu.device_ptr(), + out_indexes_gpu.device_ptr(), + output_gpu_size.device_ptr(), + start_state, + stream); // Async copy results from device to host output_gpu.device_to_host(stream_view); out_indexes_gpu.device_to_host(stream_view); + output_gpu_size.device_to_host(stream_view); // Prepare CPU-side results for verification std::string output_cpu{}; @@ -275,13 +264,13 @@ TEST_F(FstTest, GroundTruth) cudaStreamSynchronize(stream); // Verify results - ASSERT_EQ(output_gpu.size(), output_cpu.size()); + ASSERT_EQ(output_gpu_size[0], output_cpu.size()); ASSERT_EQ(out_indexes_gpu.size(), out_index_cpu.size()); - for (std::size_t i = 0; i < output_gpu.size(); i++) { - ASSERT_EQ(output_gpu.host_ptr()[i], output_cpu[i]) << "Mismatch at index #" << i; + for (std::size_t i = 0; i < output_cpu.size(); i++) { + ASSERT_EQ(output_gpu[i], output_cpu[i]) << "Mismatch at index #" << i; } for (std::size_t i = 0; i < out_indexes_gpu.size(); i++) { - ASSERT_EQ(out_indexes_gpu.host_ptr()[i], out_index_cpu[i]) << "Mismatch at index #" << i; + ASSERT_EQ(out_indexes_gpu[i], out_index_cpu[i]) << "Mismatch at index #" << i; } } From 6da9360be351c2401a380ef4d07e37275fe7def2 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 4 May 2022 07:29:00 -0700 Subject: [PATCH 019/173] removing unused var post-cleanup --- cpp/tests/io/fst/fst_test.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 9e8011bb7df..3d4f68b03c4 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -228,8 +228,6 @@ TEST_F(FstTest, GroundTruth) // Run algorithm DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream}; - std::size_t temp_storage_bytes = 0; - // Allocate device-side temporary storage & run algorithm parser.Transduce(d_input.data(), static_cast(d_input.size()), From 702dfa16b4a68af4c67ee0877db57ae81c25619d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 10:24:51 -0700 Subject: [PATCH 020/173] unified usage of pragma unrolls --- cpp/src/io/fst/in_reg_array.cuh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index ed5948249d4..3180dbfe132 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -59,7 +59,7 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // HELPER FUNCTIONS //------------------------------------------------------------------------------ - __device__ __host__ __forceinline__ uint32_t bfe(const uint32_t& data, + __host__ __device__ __forceinline__ uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { @@ -71,7 +71,7 @@ class MultiFragmentInRegArray { #endif } - __device__ __host__ __forceinline__ void bfi(uint32_t& data, + __host__ __device__ __forceinline__ void bfi(uint32_t& data, uint32_t bits, uint32_t bit_start, uint32_t num_bits) const @@ -97,7 +97,6 
@@ class MultiFragmentInRegArray { { uint32_t val = 0; - // #pragma unroll for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { val = val | bfe(data[i], index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM) << (i * BITS_PER_FRAG_ITEM); @@ -107,7 +106,6 @@ class MultiFragmentInRegArray { __host__ __device__ __forceinline__ void Set(uint32_t index, uint32_t value) { - // #pragma unroll for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { uint32_t frag_bits = bfe(value, i * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); bfi(data[i], frag_bits, index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); From 26a39ea41f52a26f82af52021a1c99db355199ec Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 10:32:17 -0700 Subject: [PATCH 021/173] Adding hostdevice macros to in-reg array --- cpp/src/io/fst/in_reg_array.cuh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index 3180dbfe132..1180dc594da 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -59,7 +61,7 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // HELPER FUNCTIONS //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ uint32_t bfe(const uint32_t& data, + CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { @@ -71,7 +73,7 @@ class MultiFragmentInRegArray { #endif } - __host__ __device__ __forceinline__ void bfi(uint32_t& data, + CUDF_HOST_DEVICE void bfi(uint32_t& data, uint32_t bits, uint32_t bit_start, uint32_t num_bits) const @@ -93,7 +95,7 @@ class MultiFragmentInRegArray { // ACCESSORS //------------------------------------------------------------------------------ public: - __host__ __device__ __forceinline__ uint32_t Get(int32_t index) const + CUDF_HOST_DEVICE uint32_t Get(int32_t index) const { uint32_t val = 0; @@ -104,7 +106,7 @@ class MultiFragmentInRegArray { return val; } - __host__ __device__ __forceinline__ void Set(uint32_t index, uint32_t value) + CUDF_HOST_DEVICE void Set(uint32_t index, uint32_t value) { for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { uint32_t frag_bits = bfe(value, i * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); @@ -115,14 +117,14 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // CONSTRUCTORS //------------------------------------------------------------------------------ - __host__ __device__ __forceinline__ MultiFragmentInRegArray() + CUDF_HOST_DEVICE MultiFragmentInRegArray() { for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { data[i] = 0; } } - __host__ __device__ __forceinline__ MultiFragmentInRegArray(uint32_t const (&array)[NUM_ITEMS]) + CUDF_HOST_DEVICE MultiFragmentInRegArray(uint32_t const (&array)[NUM_ITEMS]) { for (uint32_t i = 0; i < NUM_ITEMS; ++i) { Set(i, array[i]); From 8c685c077bff68cdec6b3b026640996859688267 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 10:33:00 -0700 Subject: [PATCH 022/173] making const vars const --- cpp/src/io/fst/agent_dfa.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 3bc59160696..aaafd2d7a22 100644 --- a/cpp/src/io/fst/agent_dfa.cuh 
+++ b/cpp/src/io/fst/agent_dfa.cuh @@ -104,7 +104,7 @@ class DFASimulationCallbackWrapper { StateVectorT const& new_state, SymbolIndexT const& symbol_id) { - uint32_t count = transducer_table(old_state.Get(0), symbol_id); + uint32_t const count = transducer_table(old_state.Get(0), symbol_id); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { out_it[out_count + out_char] = transducer_table(old_state.Get(0), symbol_id, out_char); @@ -117,7 +117,7 @@ class DFASimulationCallbackWrapper { __host__ __device__ __forceinline__ void TearDown() {} public: - TransducerTableT transducer_table; + TransducerTableT const transducer_table; TransducedOutItT out_it; TransducedIndexOutItT out_idx_it; uint32_t out_count; From 5c945211f63a73b077e1f77a7fa5b7e30312039f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 9 May 2022 12:17:34 -0700 Subject: [PATCH 023/173] refactor lut sanity check --- cpp/src/io/fst/lookup_tables.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index f8921d4091b..3e5504a6208 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -343,7 +343,7 @@ class TransducerLookupTable { } // Check whether runtime-provided table size exceeds the compile-time given max. table size - if (out_symbols.size() > MAX_TABLE_SIZE) { CUDF_FAIL("Unsupported translation table"); } + CUDF_EXPECTS(out_symbols.size() <= MAX_TABLE_SIZE, "Unsupported translation table"); // Prepare host-side data to be copied and passed to the device std::copy(std::cbegin(out_symbol_offsets), From 03b2c208e7529130fd273234a2c44c6abb5feb51 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 3 May 2022 07:05:44 -0700 Subject: [PATCH 024/173] rebase on latest FST --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/nested_json.h | 116 ++++++++ cpp/src/io/json/nested_json_gpu.cu | 410 +++++++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/nested_json_test.cu | 189 +++++++++++++ 5 files changed, 717 insertions(+) create mode 100644 cpp/src/io/json/nested_json.h create mode 100644 cpp/src/io/json/nested_json_gpu.cu create mode 100644 cpp/tests/io/nested_json_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 42a434ba53d..c690e92467c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -314,6 +314,7 @@ add_library( src/io/csv/writer_impl.cu src/io/functions.cpp src/io/json/json_gpu.cu + src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/src/io/json/nested_json.h b/cpp/src/io/json/nested_json.h new file mode 100644 index 00000000000..58f30c7b9ac --- /dev/null +++ b/cpp/src/io/json/nested_json.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
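(Editorial sketch, not part of the patch.) The in_reg_array.cuh patches above touch the bfe/bfi helpers, which are named after the PTX bit-field extract/insert instructions. The device branch of those helpers is not shown in these hunks; the sketch below only illustrates the generic extract/insert semantics with plain shifting and masking, assuming num_bits < 32.

// Portable host-side sketch of bit-field extract (bfe) and insert (bfi).
#include <cstdint>
#include <cstdio>

uint32_t bfe(uint32_t data, uint32_t bit_start, uint32_t num_bits)
{
  uint32_t const mask = (1u << num_bits) - 1u;
  return (data >> bit_start) & mask;
}

void bfi(uint32_t& data, uint32_t bits, uint32_t bit_start, uint32_t num_bits)
{
  uint32_t const mask = ((1u << num_bits) - 1u) << bit_start;
  data                = (data & ~mask) | ((bits << bit_start) & mask);
}

int main()
{
  uint32_t word = 0;
  bfi(word, 0x5u, 8, 4);                       // store value 5 in bits [8, 12)
  printf("extracted: %u\n", bfe(word, 8, 4));  // prints 5
  return 0;
}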
+ */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace io { +namespace json { +namespace gpu { + +/// Type used to represent the atomic symbol type used within the finite-state machine +using SymbolT = char; + +/// Type used to represent the stack alphabet (i.e.: empty-stack, struct, list) +using StackSymbolT = char; + +/// Type used to index into the symbols within the JSON input +using SymbolOffsetT = uint32_t; + +/// Type large enough to support indexing up to max nesting level (must be signed) +using StackLevelT = int8_t; + +/// Type used to represent a symbol group id of the input alphabet in the pushdown automaton +using PdaInputSymbolGroupIdT = char; + +/// Type used to represent a symbol group id of the stack alphabet in the pushdown automaton +using PdaStackSymbolGroupIdT = char; + +/// Type used to represent a (input-symbol, stack-symbole)-tuple in stack-symbole-major order +using PdaSymbolGroupIdT = char; + +/// Type being emitted by the pushdown automaton transducer +using PdaTokenT = char; + +/** + * @brief Tokens emitted while parsing a JSON input + */ +enum token_t : PdaTokenT { + /// Beginning-of-struct token (on encounter of semantic '{') + TK_BOS, + /// Beginning-of-list token (on encounter of semantic '[') + TK_BOL, + /// Beginning-of-error token (on first encounter of a parsing error) + TK_ERR, + /// Beginning-of-string-value token (on encounter of the string's first quote) + TK_BST, + /// Beginning-of-value token (first character of literal or numeric) + TK_BOV, + /// End-of-list token (on encounter of semantic ']') + TK_EOL, + /// End-of-struct token (on encounter of semantic '}') + TK_EOS, + /// Beginning-of-field-name token (on encounter of first quote) + TK_BFN, + /// Post-value token (first character after a literal or numeric string) + TK_POV, + /// End-of-string token (on encounter of a string's second quote) + TK_EST, + /// End-of-field-name token (on encounter of a field name's second quote) + TK_EFN, + /// Total number of tokens + NUM_TOKENS +}; + +/** + * @brief Identifies the stack context for each character from a JSON input. Specifically, we + * identify brackets and braces outside of quoted fields (e.g., field names, strings). + * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing + * bracket would actually pop a the corresponding opening brace. + * + * @param d_json_in The string of input characters + * @param d_top_of_stack + * @param stream The cuda stream to dispatch GPU kernels to + */ +void get_stack_context(device_span d_json_in, + device_span d_top_of_stack, + rmm::cuda_stream_view stream); + +/** + * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant + * sections from the input. 
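(Editorial sketch, not part of the patch.) As a reference for the get_stack_context contract declared above: a short sequential host sketch of the per-character top-of-stack it is meant to produce, i.e., for every input character the bracket or brace currently open around it, with '_' for the JSON root and with brackets inside quoted strings ignored. The input string is illustrative, escape handling is omitted, and a character's own push or pop is assumed to take effect only for later characters.

// Sequential host sketch of the "stack context" per input character.
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  std::string const json = R"({"a":[1,"]"],"b":{}})";

  std::vector<char> stack{'_'};
  bool in_string = false;
  std::string top_of_stack;
  for (char c : json) {
    top_of_stack.push_back(stack.back());  // context as seen when c is read
    if (c == '"') {
      in_string = !in_string;
    } else if (!in_string) {
      if (c == '{' || c == '[') stack.push_back(c);
      if ((c == '}' || c == ']') && stack.size() > 1) stack.pop_back();
    }
  }

  printf("input:        %s\n", json.c_str());
  printf("top-of-stack: %s\n", top_of_stack.c_str());
  return 0;
}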
+ * + * @param d_json_in The JSON input + * @param d_tokens_out Device memory to which the parsed tokens are written + * @param d_tokens_indices Device memory to which the indices are written, where each index + * represents the offset within \p d_json_in that cause the input being written + * @param d_num_written_tokens The total number of tokens that were parsed + * @param stream The CUDA stream to which kernels are dispatched + */ +void get_token_stream(device_span d_json_in, + device_span d_tokens, + device_span d_tokens_indices, + SymbolOffsetT* d_num_written_tokens, + rmm::cuda_stream_view stream); + +} // namespace gpu +} // namespace json +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu new file mode 100644 index 00000000000..ae1767bf63a --- /dev/null +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.h" + +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace json { +namespace gpu { + +//------------------------------------------------------------------------------ +// JSON-TO-STACK-OP DFA +//------------------------------------------------------------------------------ +namespace to_stack_op { + +/** + * @brief Definition of the DFA's states + */ +enum DFA_STATES { + // The state being active while being outside of a string. When encountering an opening bracket + // or curly brace, we push it onto the stack. When encountering a closing bracket or brace, we + // pop from the stack. + TT_OOS = 0U, + + // The state being active while being within a string (e.g., field name or a string value). We do + // not push or pop from the stack while being in this state. + TT_STR, + + // The state being active after encountering an escape symbol (e.g., '\'), while being in the + // TT_STR state. 
+ TT_ESC, + + // Total number of states + TT_NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum DFA_SGID { + OBC = 0U, ///< Opening brace SG: { + OBT, ///< Opening bracket SG: [ + CBC, ///< Closing brace SG: } + CBT, ///< Closing bracket SG: ] + QTE, ///< Quote character SG: " + ESC, ///< Escape character SG: '\' + OTR, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +// The i-th string representing all the characters of a symbol group +const std::vector symbol_groups = {"{", "[", "}", "]", "\"", "\\"}; + +// Transition table +const std::vector> transition_table = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, + /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR, TT_STR}, + /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; + +// Translation table (i.e., for each transition, what are the symbols that we output) +const std::vector>> translation_table = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}, + /* TT_STR */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}, + /* TT_ESC */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}; + +// The DFA's starting state +constexpr int32_t start_state = TT_OOS; +} // namespace to_stack_op + +//------------------------------------------------------------------------------ +// JSON TOKENIZER PUSHDOWN AUTOMATON +//------------------------------------------------------------------------------ +namespace tokenizer_pda { + +/** + * @brief Symbol groups for the input alphabet for the pushdown automaton + */ +enum SGID : PdaSymbolGroupIdT { + /// Opening brace + OBC, + /// Opening bracket + OBT, + /// Closing brace + CBC, + /// Closing bracket + CBT, + /// Quote + QTE, + /// Escape + ESC, + /// Comma + CMA, + /// Colon + CLN, + /// Whitespace + WSP, + /// Other (any input symbol not assigned to one of the above symbol groups) + OTR, + /// Total number of symbol groups amongst which to differentiate + NUM_PDA_INPUT_SGS +}; + +/** + * @brief Symbols in the stack alphabet + */ +enum STACK_SGID : PdaStackSymbolGroupIdT { + /// Symbol representing the JSON-root (i.e., we're at nesting level '0') + STACK_ROOT = 0, + + /// Symbol representing that we're currently within a list object + STACK_LIST = 1, + + /// Symbol representing that we're currently within a struct object + STACK_STRUCT = 2, + + /// Total number of symbols in the stack alphabet + NUM_STACK_SGS +}; + +/// Total number of symbol groups to differentiate amongst (stack alphabet * input alphabet) +constexpr PdaSymbolGroupIdT NUM_PDA_SGIDS = NUM_PDA_INPUT_SGS * NUM_STACK_SGS; + +/// Mapping a input symbol to the symbol group id +static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, WSP, WSP, OTR, OTR, WSP, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, WSP, OTR, QTE, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, CMA, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, CLN, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OBT, ESC, CBT, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OBC, OTR, CBC, OTR}; + +/** + * @brief Maps a (top-of-stack symbol, 
input symbol)-pair to a symbol group id of the DVPA + */ +struct PdaSymbolToSymbolGroupId { + template + __device__ __forceinline__ PdaSymbolGroupIdT + operator()(thrust::tuple symbol_pair) + { + // The symbol read from the input + auto symbol = thrust::get<0>(symbol_pair); + + // The stack symbol (i.e., what is on top of the stack at the time the input symbol was read) + // I.e., whether we're reading in something within a struct, a list, or the JSON root + auto stack_symbol = thrust::get<1>(symbol_pair); + + // The stack symbol offset: '_' is the root group (0), '[' is the list group (1), '{' is the + // struct group (2) + int32_t stack_idx = + (stack_symbol == '_') ? STACK_ROOT : ((stack_symbol == '[') ? STACK_LIST : STACK_STRUCT); + + // The relative symbol group id of the current input symbol + PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min( + static_cast(symbol), + static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])) - 1)]; + return stack_idx * NUM_PDA_INPUT_SGS + symbol_gid; + } +}; + +// The states defined by the pushdown automaton +enum pda_state_t : int32_t { + PD_BOV, + PD_BOA, + PD_LON, + PD_STR, + PD_SCE, + PD_PVL, + PD_BFN, + PD_FLN, + PD_FNE, + PD_PFN, + PD_ERR, + PD_NUM_STATES +}; + +// The starting state of the pushdown automaton +constexpr int32_t start_state = PD_BOV; + +// Identity symbol to symbol group lookup table +const std::vector> pda_sgids{ + {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, + {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}}; + +/** + * @brief Getting the transition table + */ +std::vector> get_transition_table() +{ + std::vector> pda_tt(PD_NUM_STATES); + pda_tt[PD_BOV] = {PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON}; + pda_tt[PD_BOA] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_ERR}; + pda_tt[PD_LON] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_LON}; + pda_tt[PD_STR] = {PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[PD_SCE] = {PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[PD_PVL] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_ERR}; + pda_tt[PD_BFN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, 
PD_ERR, PD_BFN, PD_ERR}; + pda_tt[PD_FLN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[PD_FNE] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[PD_PFN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_ERR}; + pda_tt[PD_ERR] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + return pda_tt; +} + +/** + * @brief Getting the translation table + */ +std::vector>> get_translation_table() +{ + std::vector>> pda_tlt(PD_NUM_STATES); + pda_tlt[PD_BOV] = {{TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, {TK_BST}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {}, {TK_BOV}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, {TK_BST}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_BOV}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, + {TK_BST}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_BOV}}; + pda_tlt[PD_BOA] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_EOL}, {TK_BST}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_BOV}, {TK_ERR}, {TK_ERR}, {TK_EOS}, {TK_ERR}, + {TK_BFN}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_LON] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_POV}, {}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_POV, TK_EOL}, {TK_ERR}, + {TK_ERR}, {TK_POV}, {TK_ERR}, {TK_POV}, {}, + {TK_ERR}, {TK_ERR}, {TK_POV, TK_EOS}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_POV}, {TK_ERR}, {TK_POV}, {}}; + pda_tlt[PD_STR] = {{}, {}, {}, {}, {TK_EST}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {TK_EST}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {TK_EST}, {}, {}, {}, {}, {}}; + pda_tlt[PD_SCE] = {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}; + pda_tlt[PD_PVL] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_EOL}, {TK_ERR}, {TK_ERR}, + {}, {TK_ERR}, {}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_EOS}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_BFN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_BFN}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_FLN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {}, {}, + {TK_EFN}, {}, {}, {}, {}, {}}; + pda_tlt[PD_FNE] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, 
{TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}}; + pda_tlt[PD_PFN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {TK_ERR}}; + pda_tlt[PD_ERR] = {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}; + return pda_tlt; +} + +} // namespace tokenizer_pda + +/** + * @brief Function object used to filter for brackets and braces that represent push and pop + * operations + * + */ +struct JSONToStackOp { + template + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + { + return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH + : (stack_symbol == '}' || stack_symbol == ']') ? fst::stack_op_type::POP + : fst::stack_op_type::READ; + } +}; + +void get_stack_context(device_span d_json_in, + device_span d_top_of_stack, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t single_item = 1; + + // Symbol that will represent empty-stack (i.e., that we're at the DOM root) + constexpr StackSymbolT root_symbol = '_'; + // This can be any stack symbol from the stack alphabet that does not push onto stack + constexpr StackSymbolT read_symbol = 'x'; + + // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) + hostdevice_vector d_num_stack_ops(single_item, stream); + + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector d_stack_ops{d_json_in.size(), stream}; + rmm::device_uvector d_stack_op_indices{d_json_in.size(), stream}; + + // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes + using ToStackOpFstT = cudf::io::fst::detail::Dfa; + ToStackOpFstT json_to_stack_ops_fst{to_stack_op::symbol_groups, + to_stack_op::transition_table, + to_stack_op::translation_table, + stream}; + + // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end + // structs/lists + json_to_stack_ops_fst.Transduce(d_json_in.begin(), + static_cast(d_json_in.size()), + d_stack_ops.data(), + d_stack_op_indices.data(), + d_num_stack_ops.device_ptr(), + to_stack_op::start_state, + stream); + + // Request temporary storage requirements + fst::sparse_stack_op_to_top_of_stack( + d_stack_ops.data(), + device_span{d_stack_op_indices.data(), d_stack_op_indices.size()}, + JSONToStackOp{}, + d_top_of_stack.data(), + root_symbol, + read_symbol, + d_json_in.size(), + stream); +} + +void get_token_stream(device_span d_json_in, + device_span d_tokens, + device_span d_tokens_indices, + SymbolOffsetT* d_num_written_tokens, + rmm::cuda_stream_view stream) +{ + // Memory holding the top-of-stack stack context for the input + rmm::device_uvector d_top_of_stack{d_json_in.size(), stream}; + + // Identify what is the stack context for each input character (is it: JSON-root, struct, or list) + get_stack_context(d_json_in, d_top_of_stack, stream); + + // Prepare for PDA transducer pass, merging input symbols with stack symbols + rmm::device_uvector d_pda_sgids{d_json_in.size(), stream}; + auto zip_in = thrust::make_zip_iterator(d_json_in.data(), d_top_of_stack.data()); + thrust::transform(rmm::exec_policy(stream), + zip_in, + zip_in + 
d_json_in.size(), + d_pda_sgids.data(), + tokenizer_pda::PdaSymbolToSymbolGroupId{}); + + // PDA transducer alias + using ToTokenStreamFstT = cudf::io::fst::detail:: + Dfa; + + // Instantiating PDA transducer + ToTokenStreamFstT json_to_tokens_fst{tokenizer_pda::pda_sgids, + tokenizer_pda::get_transition_table(), + tokenizer_pda::get_translation_table(), + stream}; + + // Perform a PDA-transducer pass + json_to_tokens_fst.Transduce(d_pda_sgids.begin(), + static_cast(d_json_in.size()), + d_tokens.data(), + d_tokens_indices.data(), + d_num_written_tokens, + tokenizer_pda::start_state, + stream); +} + +} // namespace gpu +} // namespace json +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d70a3d9518d..9a61daf6f8a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -222,6 +222,7 @@ ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) +ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cu) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu new file mode 100644 index 00000000000..6336f493c17 --- /dev/null +++ b/cpp/tests/io/nested_json_test.cu @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +namespace nested_json = cudf::io::json::gpu; + +// Base test fixture for tests +struct JsonTest : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTest, StackContext) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + hostdevice_vector stack_context(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + // Run algorithm + cudf::io::json::gpu::get_stack_context( + d_input, + cudf::device_span{stack_context.device_ptr(), stack_context.size()}, + stream); + + // Copy back the results + stack_context.device_to_host(stream); + + // Make sure we copied back the stack context + stream_view.synchronize(); + + std::vector golden_stack_context{ + '_', '_', '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '[', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', '[', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', + '{', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '[', '{', '{', '[', '[', + '[', '{', '[', '{', '{', '[', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '_'}; + + ASSERT_EQ(golden_stack_context.size(), stack_context.size()); + for (std::size_t i = 0; i < stack_context.size() && i < 1000; i++) { + ASSERT_EQ(golden_stack_context[i], stack_context[i]); + } +} + +TEST_F(JsonTest, TokenStream) +{ + using cudf::io::json::gpu::PdaTokenT; + using cudf::io::json::gpu::SymbolOffsetT; + using cudf::io::json::gpu::SymbolT; + + constexpr std::size_t single_item = 1; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + 
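+  // Note: this is the same JSON snippet used in the StackContext test above; the offsets in
+  // the golden token stream below are character positions within this input string.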
std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + + hostdevice_vector tokens_gpu{input.size(), stream}; + hostdevice_vector token_indices_gpu{input.size(), stream}; + hostdevice_vector num_tokens_out{single_item, stream}; + + // Parse the JSON and get the token stream + cudf::io::json::gpu::get_token_stream( + d_input, + cudf::device_span{tokens_gpu.device_ptr(), tokens_gpu.size()}, + cudf::device_span{token_indices_gpu.device_ptr(), token_indices_gpu.size()}, + num_tokens_out.device_ptr(), + stream); + + // Copy back the number of tokens that were written + num_tokens_out.device_to_host(stream); + tokens_gpu.device_to_host(stream); + token_indices_gpu.device_to_host(stream); + + // Make sure we copied back all relevant data + stream_view.synchronize(); + + // Golden token stream sample + std::vector> golden_token_stream = { + {2, nested_json::TK_BOL}, {3, nested_json::TK_BOS}, {4, nested_json::TK_BFN}, + {13, nested_json::TK_EFN}, {16, nested_json::TK_BST}, {26, nested_json::TK_EST}, + {28, nested_json::TK_BFN}, {35, nested_json::TK_EFN}, {38, nested_json::TK_BOL}, + {39, nested_json::TK_BOV}, {40, nested_json::TK_POV}, {41, nested_json::TK_BOV}, + {43, nested_json::TK_POV}, {44, nested_json::TK_BOV}, {46, nested_json::TK_POV}, + {46, nested_json::TK_EOL}, {48, nested_json::TK_BFN}, {55, nested_json::TK_EFN}, + {58, nested_json::TK_BST}, {69, nested_json::TK_EST}, {71, nested_json::TK_BFN}, + {77, nested_json::TK_EFN}, {80, nested_json::TK_BST}, {105, nested_json::TK_EST}, + {107, nested_json::TK_BFN}, {113, nested_json::TK_EFN}, {116, nested_json::TK_BOV}, + {120, nested_json::TK_POV}, {120, nested_json::TK_EOS}, {124, nested_json::TK_BOS}, + {125, nested_json::TK_BFN}, {134, nested_json::TK_EFN}, {137, nested_json::TK_BST}, + {147, nested_json::TK_EST}, {149, nested_json::TK_BFN}, {155, nested_json::TK_EFN}, + {158, nested_json::TK_BOL}, {159, nested_json::TK_BOV}, {160, nested_json::TK_POV}, + {161, nested_json::TK_BOS}, {162, nested_json::TK_EOS}, {164, nested_json::TK_BOV}, + {168, nested_json::TK_POV}, {169, nested_json::TK_BOS}, {170, nested_json::TK_BFN}, + {172, nested_json::TK_EFN}, {174, nested_json::TK_BOL}, {175, nested_json::TK_BOS}, + {177, nested_json::TK_EOS}, {180, nested_json::TK_BOS}, {181, nested_json::TK_EOS}, + {182, nested_json::TK_EOL}, {184, nested_json::TK_EOS}, {186, nested_json::TK_EOL}, + {188, nested_json::TK_BFN}, {195, nested_json::TK_EFN}, {198, nested_json::TK_BST}, + {209, nested_json::TK_EST}, {211, nested_json::TK_BFN}, {217, nested_json::TK_EFN}, + {220, nested_json::TK_BST}, {252, nested_json::TK_EST}, {254, nested_json::TK_BFN}, + {260, nested_json::TK_EFN}, {263, nested_json::TK_BOV}, {267, nested_json::TK_POV}, + {267, nested_json::TK_EOS}, {268, nested_json::TK_EOL}}; + + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); + + for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + // Ensure the index the tokens are 
pointing to do match + ASSERT_EQ(golden_token_stream[i].first, token_indices_gpu[i]); + // Ensure the token category is correct + ASSERT_EQ(golden_token_stream[i].second, tokens_gpu[i]); + } +} From ff22f197cb3a80a57f0aea14b67a143944cfccdd Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 13 May 2022 09:52:20 -0700 Subject: [PATCH 025/173] squash & rebase on latest tokenizer version --- cpp/src/io/json/nested_json.h | 49 ++++++++ cpp/src/io/json/nested_json_gpu.cu | 178 ++++++++++++++++++++++++++++- cpp/tests/io/nested_json_test.cu | 172 ++++++++++++++++++++++++++++ 3 files changed, 397 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json.h b/cpp/src/io/json/nested_json.h index 58f30c7b9ac..69c6c63ea52 100644 --- a/cpp/src/io/json/nested_json.h +++ b/cpp/src/io/json/nested_json.h @@ -49,6 +49,24 @@ using PdaSymbolGroupIdT = char; /// Type being emitted by the pushdown automaton transducer using PdaTokenT = char; +/// Type used to represent the class of a node (or a node "category") within the tree representation +using NodeT = char; + +/// Type used to index into the nodes within the tree of structs, lists, field names, and value +/// nodes +using NodeIndexT = uint32_t; + +/// Type large enough to represent tree depth from [0, max-tree-depth); may be an unsigned type +using TreeDepthT = StackLevelT; + +using tree_meta_t = std::tuple, + std::vector, + std::vector, + std::vector, + std::vector>; + +constexpr NodeIndexT parent_node_sentinel = std::numeric_limits::max(); + /** * @brief Tokens emitted while parsing a JSON input */ @@ -79,6 +97,26 @@ enum token_t : PdaTokenT { NUM_TOKENS }; +/** + * @brief Class of a node (or a node "category") within the tree representation + */ +enum node_t : NodeT { + /// A node representing a struct + NC_STRUCT, + /// A node representing a list + NC_LIST, + /// A node representing a field name + NC_FN, + /// A node representing a string value + NC_STR, + /// A node representing a numeric or literal value (e.g., true, false, null) + NC_VAL, + /// A node representing a parser error + NC_ERR, + /// Total number of node classes + NUM_NODE_CLASSES +}; + /** * @brief Identifies the stack context for each character from a JSON input. Specifically, we * identify brackets and braces outside of quoted fields (e.g., field names, strings). @@ -110,6 +148,17 @@ void get_token_stream(device_span d_json_in, SymbolOffsetT* d_num_written_tokens, rmm::cuda_stream_view stream); +/** + * @briefTakes a JSON input in host memory and returns the tree representation of the JSON input. + * Specifically, the host-side JSON input is transferred to the GPU, where the JSON tokenizer is + * run. The token stream is then copied back to the CPU where the tree representation is computed. 
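+ *
+ * Illustrative example (values follow the conventions exercised by the unit tests): for an
+ * input like [{"a": 1}], the returned tuple holds four nodes (NC_LIST, NC_STRUCT, NC_FN,
+ * NC_VAL) with parent node ids {parent_node_sentinel, 0, 1, 2}, node levels {0, 1, 2, 3},
+ * and per-node begin/end offsets giving the character span each node covers in the input.
+ *
+ * Minimal usage sketch (assuming an rmm::cuda_stream_view named stream is in scope):
+ * @code
+ * std::string const json = R"([{"a": 1}])";
+ * auto const [categories, parents, levels, range_begin, range_end] =
+ *   get_tree_representation(host_span<SymbolT const>{json.data(), json.size()}, stream);
+ * @endcode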
+ * + * @param input The JSON input + * @param stream The CUDA stream to which kernels and memcpy'ies are dispatched + * @return Returns a tree representation of the JSON input on the host + */ +tree_meta_t get_tree_representation(host_span input, rmm::cuda_stream_view stream); + } // namespace gpu } // namespace json } // namespace io diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index ae1767bf63a..723918cc4d6 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -25,6 +25,8 @@ #include +#include + namespace cudf { namespace io { namespace json { @@ -207,7 +209,7 @@ const std::vector> pda_sgids{ {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}}; /** - * @brief Getting the transition table + * @brief Getting the transition table */ std::vector> get_transition_table() { @@ -249,7 +251,7 @@ std::vector> get_transition_table() } /** - * @brief Getting the translation table + * @brief Getting the translation table */ std::vector>> get_translation_table() { @@ -404,6 +406,178 @@ void get_token_stream(device_span d_json_in, stream); } +tree_meta_t get_tree_representation(host_span input, rmm::cuda_stream_view stream) +{ + constexpr std::size_t single_item = 1; + hostdevice_vector tokens_gpu{input.size(), stream}; + hostdevice_vector token_indices_gpu{input.size(), stream}; + hostdevice_vector num_tokens_out{single_item, stream}; + + rmm::device_uvector d_input{input.size(), stream}; + cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(input[0]), cudaMemcpyHostToDevice, stream); + + // Parse the JSON and get the token stream + cudf::io::json::gpu::get_token_stream( + cudf::device_span{d_input.data(), d_input.size()}, + cudf::device_span{tokens_gpu.device_ptr(), tokens_gpu.size()}, + cudf::device_span{token_indices_gpu.device_ptr(), token_indices_gpu.size()}, + num_tokens_out.device_ptr(), + stream); + + // Copy the JSON tokens to the host + token_indices_gpu.device_to_host(stream); + tokens_gpu.device_to_host(stream); + num_tokens_out.device_to_host(stream); + + // Make sure tokens have been copied to the host + stream.synchronize(); + + // Whether a token does represent a node in the tree representation + auto is_node = [](PdaTokenT const token) { + switch (token) { + case TK_BOS: + case TK_BOL: + case TK_BST: + case TK_BOV: + case TK_BFN: + case TK_ERR: return true; + default: return false; + }; + }; + + // The node that a token represents + auto token_to_node = [](PdaTokenT const token) { + switch (token) { + case TK_BOS: return NC_STRUCT; + case TK_BOL: return NC_LIST; + case TK_BST: return NC_STR; + case TK_BOV: return NC_VAL; + case TK_BFN: return NC_FN; + default: return NC_ERR; + }; + }; + + auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) { + constexpr SymbolOffsetT skip_quote_char = 1; + switch (token) { + case TK_BST: return token_index + skip_quote_char; + case TK_BFN: return token_index + skip_quote_char; + default: return token_index; + }; + }; + + // Whether a token expects to be followed by its respective end-of-* token partner + auto is_begin_of_section = [](PdaTokenT const token) { + switch (token) { + case TK_BST: + case TK_BOV: + case TK_BFN: return true; + default: return false; + }; + }; + + // The end-of-* partner token for a given beginning-of-* token + auto end_of_partner = [](PdaTokenT const token) { + switch (token) { + case TK_BST: return TK_EST; + case TK_BOV: return TK_POV; + case TK_BFN: return 
TK_EFN; + default: return TK_ERR; + }; + }; + + // Whether the token pops from the parent node stack + auto does_pop = [](PdaTokenT const token) { + switch (token) { + case TK_EOS: + case TK_EOL: return true; + default: return false; + }; + }; + + // Whether the token pushes onto the parent node stack + auto does_push = [](PdaTokenT const token) { + switch (token) { + case TK_BOS: + case TK_BOL: return true; + default: return false; + }; + }; + + // The node id sitting on top of the stack becomes the node's parent + // The full stack represents the path from the root to the current node + std::stack> parent_stack; + + constexpr bool field_name_node = true; + constexpr bool no_field_name_node = false; + + std::vector node_categories; + std::vector parent_node_ids; + std::vector node_levels; + std::vector node_range_begin; + std::vector node_range_end; + + std::size_t node_id = 0; + for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + auto token = tokens_gpu[i]; + + // The section from the original JSON input that this token demarcates + std::size_t range_begin = get_token_index(token, token_indices_gpu[i]); + std::size_t range_end = range_begin + 1; + + // Identify this node's parent node id + std::size_t parent_node_id = + (parent_stack.size() > 0) ? parent_stack.top().first : parent_node_sentinel; + + // If this token is the beginning-of-{value, string, field name}, also consume the next end-of-* + // token + if (is_begin_of_section(token)) { + if ((i + 1) < num_tokens_out[0] && end_of_partner(tokens_gpu[i + 1])) { + // Update the range_end for this pair of tokens + range_end = token_indices_gpu[i + 1]; + // We can skip the subsequent end-of-* token + i++; + } + } + + // Emit node if this token becomes a node in the tree + if (is_node(token)) { + node_categories.push_back(token_to_node(token)); + parent_node_ids.push_back(parent_node_id); + node_levels.push_back(parent_stack.size()); + node_range_begin.push_back(range_begin); + node_range_end.push_back(range_end); + } + + // Modify the stack if needed + if (token == TK_BFN) { + parent_stack.push({node_id, field_name_node}); + } else { + if (does_push(token)) { + parent_stack.push({node_id, no_field_name_node}); + } else if (does_pop(token)) { + CUDF_EXPECTS(parent_stack.size() >= 1, "Invalid JSON input."); + parent_stack.pop(); + } + + // If what we're left with is a field name on top of stack, we need to pop it + if (parent_stack.size() >= 1 && parent_stack.top().second == field_name_node) { + parent_stack.pop(); + } + } + + // Update node_id + if (is_node(token)) { node_id++; } + } + + return std::make_tuple(std::move(node_categories), + std::move(parent_node_ids), + std::move(node_levels), + std::move(node_range_begin), + std::move(node_range_end)); +} + } // namespace gpu } // namespace json } // namespace io diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu index 6336f493c17..8b721ce2548 100644 --- a/cpp/tests/io/nested_json_test.cu +++ b/cpp/tests/io/nested_json_test.cu @@ -20,6 +20,9 @@ #include #include +#include +#include + namespace nested_json = cudf::io::json::gpu; // Base test fixture for tests @@ -187,3 +190,172 @@ TEST_F(JsonTest, TokenStream) ASSERT_EQ(golden_token_stream[i].second, tokens_gpu[i]); } } + +std::string get_node_string(std::size_t const node_id, + nested_json::tree_meta_t const& tree_rep, + std::string const& json_input) +{ + auto const& node_categories = std::get<0>(tree_rep); + auto const& parent_node_ids = std::get<1>(tree_rep); + auto const& node_levels = 
std::get<2>(tree_rep); + auto const& node_range_begin = std::get<3>(tree_rep); + auto const& node_range_end = std::get<4>(tree_rep); + + auto node_to_str = [] __host__ __device__(nested_json::PdaTokenT const token) { + switch (token) { + case nested_json::NC_STRUCT: return "STRUCT"; + case nested_json::NC_LIST: return "LIST"; + case nested_json::NC_FN: return "FN"; + case nested_json::NC_STR: return "STR"; + case nested_json::NC_VAL: return "VAL"; + case nested_json::NC_ERR: return "ERR"; + default: return "N/A"; + }; + }; + + return "<" + std::to_string(node_id) + ":" + node_to_str(node_categories[node_id]) + ":[" + + std::to_string(node_range_begin[node_id]) + ", " + + std::to_string(node_range_end[node_id]) + ") '" + + json_input.substr(node_range_begin[node_id], + node_range_end[node_id] - node_range_begin[node_id]) + + "'>"; +} + +void print_tree_representation(std::string const& json_input, + nested_json::tree_meta_t const& tree_rep) +{ + for (std::size_t i = 0; i < std::get<0>(tree_rep).size(); i++) { + auto const& parent_node_ids = std::get<1>(tree_rep); + std::size_t parent_id = parent_node_ids[i]; + std::stack path; + path.push(i); + while (parent_id != nested_json::parent_node_sentinel) { + path.push(parent_id); + parent_id = parent_node_ids[parent_id]; + } + + while (path.size()) { + auto const node_id = path.top(); + std::cout << get_node_string(node_id, tree_rep, json_input) + << (path.size() > 1 ? " -> " : ""); + path.pop(); + } + std::cout << "\n"; + } +} + +TEST_F(JsonTest, TreeRepresentation) +{ + using nested_json::PdaTokenT; + using nested_json::SymbolOffsetT; + using nested_json::SymbolT; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Get the JSON's tree representation + auto tree_rep = nested_json::get_tree_representation( + cudf::host_span{input.data(), input.size()}, stream_view); + + auto const& node_categories = std::get<0>(tree_rep); + auto const& parent_node_ids = std::get<1>(tree_rep); + auto const& node_levels = std::get<2>(tree_rep); + auto const& node_range_begin = std::get<3>(tree_rep); + auto const& node_range_end = std::get<4>(tree_rep); + + // Golden sample of node categories + std::vector golden_node_categories = { + nested_json::NC_LIST, nested_json::NC_STRUCT, nested_json::NC_FN, nested_json::NC_STR, + nested_json::NC_FN, nested_json::NC_LIST, nested_json::NC_VAL, nested_json::NC_VAL, + nested_json::NC_VAL, nested_json::NC_FN, nested_json::NC_STR, nested_json::NC_FN, + nested_json::NC_STR, nested_json::NC_FN, nested_json::NC_VAL, nested_json::NC_STRUCT, + nested_json::NC_FN, nested_json::NC_STR, nested_json::NC_FN, nested_json::NC_LIST, + nested_json::NC_VAL, nested_json::NC_STRUCT, nested_json::NC_VAL, nested_json::NC_STRUCT, + nested_json::NC_FN, nested_json::NC_LIST, nested_json::NC_STRUCT, nested_json::NC_STRUCT, + nested_json::NC_FN, nested_json::NC_STR, nested_json::NC_FN, nested_json::NC_STR, + nested_json::NC_FN, nested_json::NC_VAL}; + + // Golden sample of node ids + std::vector golden_parent_node_ids = 
{nested_json::parent_node_sentinel, + 0, + 1, + 2, + 1, + 4, + 5, + 5, + 5, + 1, + 9, + 1, + 11, + 1, + 13, + 0, + 15, + 16, + 15, + 18, + 19, + 19, + 19, + 19, + 23, + 24, + 25, + 25, + 15, + 28, + 15, + 30, + 15, + 32}; + + // Golden sample of node levels + std::vector golden_node_levels = {0, 1, 2, 3, 2, 3, 4, 4, 4, 2, 3, 2, + 3, 2, 3, 1, 2, 3, 2, 3, 4, 4, 4, 4, + 5, 6, 7, 7, 2, 3, 2, 3, 2, 3}; + + // Golden sample of the character-ranges from the original input that each node demarcates + std::vector golden_node_range_begin = { + 2, 3, 5, 17, 29, 38, 39, 41, 44, 49, 59, 72, 81, 108, 116, 124, 126, + 138, 150, 158, 159, 161, 164, 169, 171, 174, 175, 180, 189, 199, 212, 221, 255, 263}; + + // Golden sample of the character-ranges from the original input that each node demarcates + std::vector golden_node_range_end = { + 3, 4, 13, 26, 35, 39, 40, 43, 46, 55, 69, 77, 105, 113, 120, 125, 134, + 147, 155, 159, 160, 162, 168, 170, 172, 175, 176, 181, 195, 209, 217, 252, 260, 267}; + + // Check results against golden samples + ASSERT_EQ(golden_node_categories.size(), node_categories.size()); + ASSERT_EQ(golden_parent_node_ids.size(), parent_node_ids.size()); + ASSERT_EQ(golden_node_levels.size(), node_levels.size()); + ASSERT_EQ(golden_node_range_begin.size(), node_range_begin.size()); + ASSERT_EQ(golden_node_range_end.size(), node_range_end.size()); + + for (std::size_t i = 0; i < golden_node_categories.size(); i++) { + ASSERT_EQ(golden_node_categories[i], node_categories[i]); + ASSERT_EQ(golden_parent_node_ids[i], parent_node_ids[i]); + ASSERT_EQ(golden_node_levels[i], node_levels[i]); + ASSERT_EQ(golden_node_range_begin[i], node_range_begin[i]); + ASSERT_EQ(golden_node_range_end[i], node_range_end[i]); + } +} From 365b839b645457dc2706e913347b28fd01389dfa Mon Sep 17 00:00:00 2001 From: vuule Date: Sat, 11 Jun 2022 01:55:13 -0700 Subject: [PATCH 026/173] start --- cpp/src/io/json/data_casting.cu | 29 +++++++++++++++++++++ cpp/src/io/json/data_casting.cuh | 33 ++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/json_type_cast_test.cpp | 38 ++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+) create mode 100644 cpp/src/io/json/data_casting.cu create mode 100644 cpp/src/io/json/data_casting.cuh create mode 100644 cpp/tests/io/json_type_cast_test.cpp diff --git a/cpp/src/io/json/data_casting.cu b/cpp/src/io/json/data_casting.cu new file mode 100644 index 00000000000..28e3bd7bfcf --- /dev/null +++ b/cpp/src/io/json/data_casting.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "data_casting.cuh" + +namespace cudf::io::json::experimental { + +template +void parse_data(device_span str_data_ptrs, + device_span str_data_sizes, + host_span col_types, + std::vector cols, + rmm::cuda_stream_view stream) +{ +} +} // namespace cudf::io::json::experimental \ No newline at end of file diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh new file mode 100644 index 00000000000..fd632b20433 --- /dev/null +++ b/cpp/src/io/json/data_casting.cuh @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +namespace cudf::io::json::experimental { + +template +void parse_data(device_span str_data_ptrs, + device_span str_data_sizes, + host_span col_types, + std::vector cols, + rmm::cuda_stream_view stream); + +} \ No newline at end of file diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 816c5a1c59c..f3b9db1b114 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -222,6 +222,7 @@ ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) +ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3) diff --git a/cpp/tests/io/json_type_cast_test.cpp b/cpp/tests/io/json_type_cast_test.cpp new file mode 100644 index 00000000000..de2f7c81280 --- /dev/null +++ b/cpp/tests/io/json_type_cast_test.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +struct JSONTypeCastTest : public cudf::test::BaseFixture { +}; + +TEST_F(JSONTypeCastTest, Basic) { EXPECT_TRUE(1); } + +CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file From 419d3c8d7fe072a1bfb7043a9176f36ad9e0333f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 17 Jun 2022 12:07:52 -0400 Subject: [PATCH 027/173] Add type inference test in CMake --- cpp/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9a61daf6f8a..3105e1a87df 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -226,7 +226,7 @@ ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cu) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) -ConfigureTest(FST_TEST io/fst/fst_test.cu) +ConfigureTest(FST_TEST io/fst/fst_test.cu io/fst/type_inference_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() From f32b9b9c81a7203cf5f2c2647db8ca0d91090e7c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 17 Jun 2022 15:37:10 -0400 Subject: [PATCH 028/173] Add type inference prototype and basic test --- cpp/src/io/fst/type_inference.cuh | 250 ++++++++++++++++++++++++ cpp/tests/io/fst/type_inference_test.cu | 60 ++++++ 2 files changed, 310 insertions(+) create mode 100644 cpp/src/io/fst/type_inference.cuh create mode 100644 cpp/tests/io/fst/type_inference_test.cu diff --git a/cpp/src/io/fst/type_inference.cuh b/cpp/src/io/fst/type_inference.cuh new file mode 100644 index 00000000000..5bf687e106e --- /dev/null +++ b/cpp/src/io/fst/type_inference.cuh @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace fst { +namespace detail { +/** + * @brief Structure for type inference options + */ +struct inference_options_view { + cudf::detail::trie_view trie_true; + cudf::detail::trie_view trie_false; + cudf::detail::trie_view trie_na; +}; + +struct inference_options { + cudf::detail::optional_trie trie_true; + cudf::detail::optional_trie trie_false; + cudf::detail::optional_trie trie_na; + + [[nodiscard]] inference_options_view view() const + { + return {cudf::detail::make_trie_view(trie_true), + cudf::detail::make_trie_view(trie_false), + cudf::detail::make_trie_view(trie_na)}; + } +}; + +/** + * @brief Returns true is the input character is a valid digit. + * Supports both decimal and hexadecimal digits (uppercase and lowercase). 
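+ * For example, '7' is always treated as a digit, whereas 'b' and 'F' only count as digits
+ * when is_hex is true.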
+ * + * @param c Character to check + * @param is_hex Whether to check as a hexadecimal + * + * @return `true` if it is digit-like, `false` otherwise + */ +__device__ __inline__ bool is_digit(char const c, bool const is_hex = false) +{ + if (c >= '0' && c <= '9') return true; + + if (is_hex) { + if (c >= 'A' && c <= 'F') return true; + if (c >= 'a' && c <= 'f') return true; + } + + return false; +} + +/** + * @brief Returns true if the counters indicate a potentially valid float. + * False positives are possible because positions are not taken into account. + * For example, field "e.123-" would match the pattern. + */ +__device__ __inline__ bool is_like_float( + long len, long digit_cnt, long decimal_cnt, long dash_cnt, long exponent_cnt) +{ + // Can't have more than one exponent and one decimal point + if (decimal_cnt > 1) return false; + if (exponent_cnt > 1) return false; + // Without the exponent or a decimal point, this is an integer, not a float + if (decimal_cnt == 0 && exponent_cnt == 0) return false; + + // Can only have one '-' per component + if (dash_cnt > 1 + exponent_cnt) return false; + + // If anything other than these characters is present, it's not a float + if (digit_cnt + decimal_cnt + dash_cnt + exponent_cnt != len) return false; + + // Needs at least 1 digit, 2 if exponent is present + if (digit_cnt < 1 + exponent_cnt) return false; + + return true; +} + +template +__global__ void detect_column_type_kernel(inference_options_view const options, + device_span const data, + ColumnStringIter column_strings_begin, + std::size_t const size, + cudf::io::column_type_histogram* column_info) +{ + auto idx = threadIdx.x + blockDim.x * blockIdx.x; + + while (idx < size) { + auto const [field_offset, field_len] = *(column_strings_begin + idx); + auto const field_begin = data.begin() + field_offset; + if (cudf::detail::serialized_trie_contains(options.trie_na, {field_begin, field_len})) { + atomicAdd(&column_info->null_count, 1); + continue; + } + + // No need to check strings since it's inferred in the tree generation + int digit_count = 0; + int decimal_count = 0; + int slash_count = 0; + int dash_count = 0; + int plus_count = 0; + int colon_count = 0; + int exponent_count = 0; + int other_count = 0; + + auto const maybe_hex = (field_len > 2 && *field_begin == '0' && *(field_begin + 1) == 'x') || + (field_len > 3 && *field_begin == '-' && *(field_begin + 1) == '0' && + *(field_begin + 2) == 'x'); + auto const field_end = field_begin + field_len; + + for (auto pos = field_begin; pos < field_end; ++pos) { + if (is_digit(*pos, maybe_hex)) { + digit_count++; + continue; + } + // Looking for unique characters that will help identify column types + switch (*pos) { + case '.': decimal_count++; break; + case '-': dash_count++; break; + case '+': plus_count++; break; + case '/': slash_count++; break; + case ':': colon_count++; break; + case 'e': + case 'E': + if (!maybe_hex && pos > field_begin && pos < field_end - 1) exponent_count++; + break; + default: other_count++; break; + } + } + + // Integers have to have the length of the string + int int_req_number_cnt = field_len; + // Off by one if they start with a minus sign + if ((*field_begin == '-' || *field_begin == '+') && field_len > 1) { --int_req_number_cnt; } + // Off by one if they are a hexadecimal number + if (maybe_hex) { --int_req_number_cnt; } + if (cudf::detail::serialized_trie_contains(options.trie_true, {field_begin, field_len}) || + cudf::detail::serialized_trie_contains(options.trie_false, {field_begin, field_len})) { + 
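+      // Field exactly matches one of the boolean tries (e.g. the "true"/"false" entries
+      // configured via inference_options), so count it as a boolean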
atomicAdd(&column_info->bool_count, 1); + } else if (digit_count == int_req_number_cnt) { + bool is_negative = (*field_begin == '-'); + char const* data_begin = field_begin + (is_negative || (*field_begin == '+')); + cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( + data_begin, data_begin + digit_count, is_negative, *column_info); + atomicAdd(ptr, 1); + } else if (is_like_float( + field_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { + atomicAdd(&column_info->float_count, 1); + } + // A date field can have either one or two '-' or '\'; A legal combination will only have one + // of them To simplify the process of auto column detection, we are not covering all the + // date-time formation permutations + else if (((dash_count > 0 && dash_count <= 2 && slash_count == 0) || + (dash_count == 0 && slash_count > 0 && slash_count <= 2)) && + colon_count <= 2) { + atomicAdd(&column_info->datetime_count, 1); + } + + idx += gridDim.x + blockDim.x; + } // while +} + +template +cudf::io::column_type_histogram detect_column_type(inference_options_view const& options, + cudf::device_span data, + ColumnStringIter column_strings_begin, + std::size_t const size, + rmm::cuda_stream_view stream) +{ + constexpr int block_size = 128; + + auto const grid_size = (size + block_size - 1) / block_size; + auto d_column_info = rmm::device_scalar(stream); + CUDF_CUDA_TRY(cudaMemsetAsync( + d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); + + detect_column_type_kernel<<>>( + options, data, column_strings_begin, size, d_column_info.data()); + + return d_column_info.value(stream); +} + +template +cudf::data_type detect_data_type(inference_options_view const& options, + device_span data, + ColumnStringIter column_strings_begin, + std::size_t const size, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); + + auto const h_column_info = detect_column_type(options, data, column_strings_begin, size, stream); + + auto get_type_id = [&](auto const& cinfo) { + auto int_count_total = + cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; + if (cinfo.null_count == static_cast(size)) { + // Entire column is NULL; allocate the smallest amount of memory + return type_id::INT8; + } else if (cinfo.string_count > 0) { + CUDF_FAIL("Unexpected string type in type inference."); + } else if (cinfo.datetime_count > 0) { + return type_id::TIMESTAMP_MILLISECONDS; + } else if (cinfo.float_count > 0 || (int_count_total > 0 && cinfo.null_count > 0)) { + return type_id::FLOAT64; + } else if (cinfo.big_int_count == 0 && int_count_total != 0) { + return type_id::INT64; + } else if (cinfo.big_int_count != 0 && cinfo.negative_small_int_count != 0) { + return type_id::STRING; + } else if (cinfo.big_int_count != 0) { + return type_id::UINT64; + } else if (cinfo.bool_count > 0) { + return type_id::BOOL8; + } else { + CUDF_FAIL("Data type detection failed.\n"); + } + }; + return cudf::data_type{get_type_id(h_column_info)}; +} +} // namespace detail +} // namespace fst +} // namespace io +} // namespace cudf diff --git a/cpp/tests/io/fst/type_inference_test.cu b/cpp/tests/io/fst/type_inference_test.cu new file mode 100644 index 00000000000..16ac0fd5211 --- /dev/null +++ b/cpp/tests/io/fst/type_inference_test.cu @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +using cudf::io::fst::detail::detect_data_type; +using cudf::io::fst::detail::inference_options; + +// Base test fixture for tests +struct TypeInference : public cudf::test::BaseFixture { +}; + +TEST_F(TypeInference, Basic) +{ + auto stream = rmm::cuda_stream_default; + auto options = inference_options{}; + + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = "[42,52,5]"; + rmm::device_uvector d_data{data.size(), stream}; + cudaMemcpyAsync( + d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + + std::size_t constexpr size = 3; + rmm::device_uvector> d_col_strings{size, stream}; + d_col_strings.set_element(0, {1, 2}, stream); + d_col_strings.set_element(1, {4, 2}, stream); + d_col_strings.set_element(2, {7, 1}, stream); + + auto res_type = detect_data_type(options.view(), d_data, d_col_strings.begin(), size, stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); +} From 35bcde7aa9ac1332c04195af60c902faccf08227 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 17 Jun 2022 15:57:41 -0400 Subject: [PATCH 029/173] Code formatting --- cpp/src/io/fst/agent_dfa.cuh | 2 +- cpp/src/io/fst/in_reg_array.cuh | 10 ++++------ cpp/tests/io/nested_json_test.cu | 3 +-- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index aaafd2d7a22..70f18b50f6c 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -183,7 +183,7 @@ struct StateTransitionOp { __host__ __device__ __forceinline__ void ReadSymbol(const CharIndexT& character_index, const SymbolIndexT& read_symbol_id) { - old_state_vector = state_vector; + old_state_vector = state_vector; state_vector.Set(0, transition_table(state_vector.Get(0), read_symbol_id)); callback_op.ReadSymbol(character_index, old_state_vector, state_vector, read_symbol_id); } diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh index 1180dc594da..db687d04181 100644 --- a/cpp/src/io/fst/in_reg_array.cuh +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -61,9 +61,7 @@ class MultiFragmentInRegArray { //------------------------------------------------------------------------------ // HELPER FUNCTIONS //------------------------------------------------------------------------------ - CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, - uint32_t bit_start, - uint32_t num_bits) const + CUDF_HOST_DEVICE uint32_t bfe(const uint32_t& data, uint32_t bit_start, uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 return cub::BFE(data, bit_start, num_bits); @@ -74,9 +72,9 @@ class MultiFragmentInRegArray { } CUDF_HOST_DEVICE void bfi(uint32_t& data, - 
uint32_t bits, - uint32_t bit_start, - uint32_t num_bits) const + uint32_t bits, + uint32_t bit_start, + uint32_t num_bits) const { #if CUB_PTX_ARCH > 0 cub::BFI(data, data, bits, bit_start, num_bits); diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu index 8b721ce2548..e11727f01c6 100644 --- a/cpp/tests/io/nested_json_test.cu +++ b/cpp/tests/io/nested_json_test.cu @@ -133,7 +133,6 @@ TEST_F(JsonTest, TokenStream) ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); - hostdevice_vector tokens_gpu{input.size(), stream}; hostdevice_vector token_indices_gpu{input.size(), stream}; hostdevice_vector num_tokens_out{single_item, stream}; @@ -182,7 +181,7 @@ TEST_F(JsonTest, TokenStream) // Verify the number of tokens matches ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); - + for (std::size_t i = 0; i < num_tokens_out[0]; i++) { // Ensure the index the tokens are pointing to do match ASSERT_EQ(golden_token_stream[i].first, token_indices_gpu[i]); From ec856a7788f79d2da3d7ed5eb08b2b6a666a0908 Mon Sep 17 00:00:00 2001 From: Vukasin Date: Fri, 17 Jun 2022 15:27:59 -0700 Subject: [PATCH 030/173] rework API --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/data_casting.cu | 23 ++++++++++++++++------- cpp/src/io/json/data_casting.cuh | 15 +++++++-------- cpp/tests/io/json_type_cast_test.cpp | 10 ++++++++-- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fc21c2def4d..c6826c9402f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -333,6 +333,7 @@ add_library( src/io/functions.cpp src/io/json/json_gpu.cu src/io/json/reader_impl.cu + src/io/json/data_casting.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp diff --git a/cpp/src/io/json/data_casting.cu b/cpp/src/io/json/data_casting.cu index 28e3bd7bfcf..17977d1c435 100644 --- a/cpp/src/io/json/data_casting.cu +++ b/cpp/src/io/json/data_casting.cu @@ -18,12 +18,21 @@ namespace cudf::io::json::experimental { -template -void parse_data(device_span str_data_ptrs, - device_span str_data_sizes, - host_span col_types, - std::vector cols, - rmm::cuda_stream_view stream) +template +std::vector> parse_data(str_spans_it_it cols_str_spans, + host_span cols_type, + rmm::cuda_stream_view stream) { + std::cout << cols_type.size() << std::endl; + for (auto& type : cols_type) { + if (type.id() != type_id::STRING) + std::cout << "NOT" << std::endl; + else + std::cout << "STRING" << std::endl; + } + return {}; } -} // namespace cudf::io::json::experimental \ No newline at end of file + +template std::vector> parse_data( + string_view** cols_str_spans, host_span cols_type, rmm::cuda_stream_view stream); +} // namespace cudf::io::json::experimental diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index fd632b20433..bbafa73f5ff 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -14,20 +14,19 @@ * limitations under the License. 
*/ +#include #include -#include #include +#include #include namespace cudf::io::json::experimental { -template -void parse_data(device_span str_data_ptrs, - device_span str_data_sizes, - host_span col_types, - std::vector cols, - rmm::cuda_stream_view stream); +template +std::vector> parse_data(str_spans_it_it cols_str_spans, + host_span cols_type, + rmm::cuda_stream_view stream); -} \ No newline at end of file +} diff --git a/cpp/tests/io/json_type_cast_test.cpp b/cpp/tests/io/json_type_cast_test.cpp index de2f7c81280..45e192ea860 100644 --- a/cpp/tests/io/json_type_cast_test.cpp +++ b/cpp/tests/io/json_type_cast_test.cpp @@ -33,6 +33,12 @@ struct JSONTypeCastTest : public cudf::test::BaseFixture { }; -TEST_F(JSONTypeCastTest, Basic) { EXPECT_TRUE(1); } +TEST_F(JSONTypeCastTest, Basic) +{ + std::vector types{cudf::data_type{cudf::type_id::INT32}}; + cudf::io::json::experimental::parse_data( + {}, types, rmm::cuda_stream_default); + EXPECT_TRUE(0); +} -CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file +CUDF_TEST_PROGRAM_MAIN() From bf24ef5a2d707ada75e4e00781fc5c81967925f4 Mon Sep 17 00:00:00 2001 From: Vukasin Date: Fri, 17 Jun 2022 16:37:11 -0700 Subject: [PATCH 031/173] tune up API; set up test inputs --- cpp/src/io/json/data_casting.cu | 17 +++---- cpp/src/io/json/data_casting.cuh | 3 +- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/io/json_type_cast_test.cpp | 44 ----------------- cpp/tests/io/json_type_cast_test.cu | 70 ++++++++++++++++++++++++++++ 5 files changed, 80 insertions(+), 56 deletions(-) delete mode 100644 cpp/tests/io/json_type_cast_test.cpp create mode 100644 cpp/tests/io/json_type_cast_test.cu diff --git a/cpp/src/io/json/data_casting.cu b/cpp/src/io/json/data_casting.cu index 17977d1c435..27b52b6d6fa 100644 --- a/cpp/src/io/json/data_casting.cu +++ b/cpp/src/io/json/data_casting.cu @@ -18,21 +18,18 @@ namespace cudf::io::json::experimental { -template +template std::vector> parse_data(str_spans_it_it cols_str_spans, + col_size_it cols_size, host_span cols_type, rmm::cuda_stream_view stream) { - std::cout << cols_type.size() << std::endl; - for (auto& type : cols_type) { - if (type.id() != type_id::STRING) - std::cout << "NOT" << std::endl; - else - std::cout << "STRING" << std::endl; - } return {}; } -template std::vector> parse_data( - string_view** cols_str_spans, host_span cols_type, rmm::cuda_stream_view stream); +template std::vector> parse_data( + string_view** cols_str_spans, + size_type* cols_size, + host_span cols_type, + rmm::cuda_stream_view stream); } // namespace cudf::io::json::experimental diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index bbafa73f5ff..95d37c8ba48 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -24,8 +24,9 @@ namespace cudf::io::json::experimental { -template +template std::vector> parse_data(str_spans_it_it cols_str_spans, + col_size_it cols_size, host_span cols_type, rmm::cuda_stream_view stream); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f3b9db1b114..69aa9a7c413 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -222,7 +222,7 @@ ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) -ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cpp) +ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) 
ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3) diff --git a/cpp/tests/io/json_type_cast_test.cpp b/cpp/tests/io/json_type_cast_test.cpp deleted file mode 100644 index 45e192ea860..00000000000 --- a/cpp/tests/io/json_type_cast_test.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -struct JSONTypeCastTest : public cudf::test::BaseFixture { -}; - -TEST_F(JSONTypeCastTest, Basic) -{ - std::vector types{cudf::data_type{cudf::type_id::INT32}}; - cudf::io::json::experimental::parse_data( - {}, types, rmm::cuda_stream_default); - EXPECT_TRUE(0); -} - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu new file mode 100644 index 00000000000..375dd0c0136 --- /dev/null +++ b/cpp/tests/io/json_type_cast_test.cu @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct JSONTypeCastTest : public cudf::test::BaseFixture { +}; + +namespace { +using string_pair = thrust::pair; +struct string_view_only { + __device__ cudf::string_view operator()(thrust::pair const& p) + { + return p.first; + } +}; +} // namespace + +TEST_F(JSONTypeCastTest, RealBasic) +{ + auto const stream = rmm::cuda_stream_default; + std::vector types{cudf::data_type{cudf::type_id::INT32}}; + + cudf::test::strings_column_wrapper data({"this", "is", "a", "column", "of", "strings"}); + auto d_column = cudf::column_device_view::create(data); + rmm::device_uvector svs(d_column->size(), rmm::cuda_stream_default); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + svs.data(), + string_view_only{}); + + std::vector str_spans{svs.data()}; + auto d_str_spans = cudf::detail::make_device_uvector_async(str_spans, stream); + std::vector col_size{(cudf::size_type)svs.size()}; + auto d_col_size = cudf::detail::make_device_uvector_async(col_size, stream); + + cudf::io::json::experimental::parse_data( + d_str_spans.data(), d_col_size.data(), types, rmm::cuda_stream_default); +} + +CUDF_TEST_PROGRAM_MAIN() From 54cda7337c564ec751d0fd38a730bd2c046e1aae Mon Sep 17 00:00:00 2001 From: Vukasin Date: Fri, 17 Jun 2022 16:41:42 -0700 Subject: [PATCH 032/173] comments --- cpp/src/io/json/data_casting.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/json/data_casting.cu b/cpp/src/io/json/data_casting.cu index 27b52b6d6fa..9766ad24369 100644 --- a/cpp/src/io/json/data_casting.cu +++ b/cpp/src/io/json/data_casting.cu @@ -24,6 +24,8 @@ std::vector> parse_data(str_spans_it_it cols_str_spans, host_span cols_type, rmm::cuda_stream_view stream) { + // first version: make_strings_column from the string spans + // full version: use existing code (`ConvertFunctor`) to convert values return {}; } From 9f2247f4d3e4a035b141251ea4cf4f1b6a6ef8ec Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 22 Jul 2022 11:51:44 -0700 Subject: [PATCH 033/173] add placeholder experimental JSON reader --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/json.hpp | 28 +++++++++++++++++ cpp/src/io/json/experimental/read_json.cpp | 31 +++++++++++++++++++ cpp/src/io/json/experimental/read_json.hpp | 36 ++++++++++++++++++++++ cpp/src/io/json/reader_impl.cu | 6 ++++ python/cudf/cudf/_lib/cpp/io/json.pxd | 5 +++ python/cudf/cudf/_lib/json.pyx | 4 ++- python/cudf/cudf/io/json.py | 9 ++++-- python/cudf/cudf/utils/ioutils.py | 2 +- 9 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 cpp/src/io/json/experimental/read_json.cpp create mode 100644 cpp/src/io/json/experimental/read_json.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4819d1c2f5c..104e731c470 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -322,6 +322,7 @@ add_library( src/io/functions.cpp src/io/json/json_gpu.cu src/io/json/reader_impl.cu + src/io/json/experimental/read_json.cpp src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 9ccb5ec4d58..01334060063 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -80,6 +80,9 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; + // Whether to parse dates as DD/MM versus MM/DD + bool _experimental = false; + /** * @brief 
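As a sketch of the "first version" outlined in the data_casting comments above (the helper name here is illustrative, not part of the patch): building a STRING column straight from the (pointer, length) spans only needs the existing pair-based strings factory, which is what the later commits end up wiring in.

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/pair.h>

#include <memory>

// Hypothetical helper: the STRING path of parse_data reduces to a pass-through
// to make_strings_column over contiguous (char const*, size) pairs.
std::unique_ptr<cudf::column> strings_from_spans(
  cudf::device_span<thrust::pair<char const*, cudf::size_type> const> spans,
  rmm::cuda_stream_view stream)
{
  return cudf::make_strings_column(spans, stream);
}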
Constructor from source info. * @@ -193,6 +196,13 @@ class json_reader_options { */ bool is_enabled_dayfirst() const { return _dayfirst; } + /** + * @brief Whether the experimental reader should be used. + * + * @returns true if the experimental reader will be used, false otherwise + */ + bool is_enabled_experimental() const { return _experimental; } + /** * @brief Set data types for columns to be read. * @@ -241,6 +251,13 @@ class json_reader_options { * @param val Boolean value to enable/disable day first parsing format */ void enable_dayfirst(bool val) { _dayfirst = val; } + + /** + * @brief Set whether to use the experimental reader. + * + * @param val Boolean value to enable/disable the experimental readers + */ + void enable_experimental(bool val) { _experimental = val; } }; /** @@ -347,6 +364,17 @@ class json_reader_options_builder { options._dayfirst = val; return *this; } + /** + * @brief Set whether to use the experimental reader. + * + * @param val Boolean value to enable/disable experimental parsing + * @return this for chaining + */ + json_reader_options_builder& experimental(bool val) + { + options._experimental = val; + return *this; + } /** * @brief move json_reader_options member once it's built. diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp new file mode 100644 index 00000000000..fef5aa7d794 --- /dev/null +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "read_json.hpp" + +#include + +namespace cudf::io::detail::json::experimental { + +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FAIL("Not implemented"); +} + +} // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/experimental/read_json.hpp new file mode 100644 index 00000000000..9c39315da30 --- /dev/null +++ b/cpp/src/io/json/experimental/read_json.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::io::detail::json::experimental { + +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 052c51351a1..7e6be190acb 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -16,6 +16,8 @@ #include "json_gpu.hpp" +#include "experimental/read_json.hpp" + #include #include @@ -571,6 +573,10 @@ table_with_metadata read_json(std::vector>& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + if (reader_opts.is_enabled_experimental()) { + return experimental::read_json(sources, reader_opts, stream, mr); + } + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 2c65e329bb0..6e240d00349 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -24,6 +24,7 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_byte_range_size() except+ bool is_enabled_lines() except+ bool is_enabled_dayfirst() except+ + bool is_enabled_experimental() except+ # setter void set_dtypes(vector[data_type] types) except+ @@ -35,6 +36,7 @@ cdef extern from "cudf/io/json.hpp" \ void set_byte_range_size(size_type size) except+ void enable_lines(bool val) except+ void enable_dayfirst(bool val) except+ + void enable_experimental(bool val) except+ @staticmethod json_reader_options_builder builder( @@ -70,6 +72,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dayfirst( bool val ) except+ + json_reader_options_builder& experimental( + bool val + ) except+ json_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 263d70afe26..89057e61b6b 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -31,7 +31,8 @@ cpdef read_json(object filepaths_or_buffers, object dtype, bool lines, object compression, - object byte_range): + object byte_range, + bool experimental): """ Cython function to call into libcudf API, see `read_json`. @@ -98,6 +99,7 @@ cpdef read_json(object filepaths_or_buffers, .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) + .experimental(experimental) .build() ) if is_list_like_dtypes: diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 869e055decf..f7c5c36edc5 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -27,7 +27,7 @@ def read_json( raise ValueError("cudf engine only supports JSON Lines format") if engine == "auto": engine = "cudf" if lines else "pandas" - if engine == "cudf": + if engine == "cudf" or engine == "cudf_experimental": # Multiple sources are passed as a list. If a single source is passed, # wrap it in a list for unified processing downstream. 
if not is_list_like(path_or_buf): @@ -56,7 +56,12 @@ def read_json( return cudf.DataFrame._from_data( *libjson.read_json( - filepaths_or_buffers, dtype, lines, compression, byte_range + filepaths_or_buffers, + dtype, + lines, + compression, + byte_range, + engine == "cudf_experimental", ) ) else: diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 3771587eb47..d3c41de842a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -463,7 +463,7 @@ function or `StringIO`). Multiple inputs may be provided as a list. If a list is specified each list entry may be of a different input type as long as each input is of a valid type and all input JSON schema(s) match. -engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto' +engine : {{ 'auto', 'cudf', 'cudf_experimental', 'pandas' }}, default 'auto' Parser engine to use. If 'auto' is passed, the engine will be automatically selected based on the other parameters. orient : string, From 76b283475bb2ded9622c8e9f1cae63a562db969b Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 22 Jul 2022 11:58:07 -0700 Subject: [PATCH 034/173] doc fix --- cpp/include/cudf/io/json.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 01334060063..72d5fc9c4a6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -80,7 +80,7 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; - // Whether to parse dates as DD/MM versus MM/DD + // Whether to use the experimental reader bool _experimental = false; /** @@ -255,7 +255,7 @@ class json_reader_options { /** * @brief Set whether to use the experimental reader. * - * @param val Boolean value to enable/disable the experimental readers + * @param val Boolean value to enable/disable the experimental reader */ void enable_experimental(bool val) { _experimental = val; } }; From f5464f654f606566ca3701cbf9ec949cf4c1e6ce Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 22 Jul 2022 12:10:15 -0700 Subject: [PATCH 035/173] copyright year --- python/cudf/cudf/_lib/cpp/io/json.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 6e240d00349..bc9d87a5cbf 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint8_t from libcpp cimport bool From 2ca0ac0442c26a10ccce4e4bef42abaec016c0d1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 25 Jul 2022 13:27:38 -0700 Subject: [PATCH 036/173] newline Co-authored-by: Bradley Dice --- cpp/include/cudf/io/json.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 72d5fc9c4a6..73724b99589 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -364,6 +364,7 @@ class json_reader_options_builder { options._dayfirst = val; return *this; } + /** * @brief Set whether to use the experimental reader. 
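For reference, a caller-side sketch of how the new toggle is meant to be driven from C++ (the input path is a placeholder, and at this point in the series the experimental branch still ends in CUDF_FAIL("Not implemented")):

#include <cudf/io/json.hpp>

// Hypothetical driver: opting into the experimental reader through the options builder.
cudf::io::table_with_metadata read_with_experimental_engine()
{
  auto const opts =
    cudf::io::json_reader_options::builder(cudf::io::source_info{"input.json"})
      .experimental(true)  // routes reader_impl's read_json into experimental::read_json
      .build();
  return cudf::io::read_json(opts);
}

On the Python side the same branch is selected by passing engine="cudf_experimental" to cudf.read_json.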
* From 3ee7a5accaccc005445f7564c4d03df97eebb4d1 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 25 Jul 2022 13:32:45 -0700 Subject: [PATCH 037/173] use span --- cpp/src/io/json/experimental/read_json.cpp | 2 +- cpp/src/io/json/experimental/read_json.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index fef5aa7d794..146eaf203e4 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -20,7 +20,7 @@ namespace cudf::io::detail::json::experimental { -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/experimental/read_json.hpp index 9c39315da30..c9f74b2cc41 100644 --- a/cpp/src/io/json/experimental/read_json.hpp +++ b/cpp/src/io/json/experimental/read_json.hpp @@ -19,16 +19,16 @@ #include #include #include +#include #include #include #include -#include namespace cudf::io::detail::json::experimental { -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From fcc90c5a3a390165daea1f19d588f8c2134a7c55 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 26 Jul 2022 19:08:19 -0700 Subject: [PATCH 038/173] options check + decompression --- cpp/include/cudf/io/types.hpp | 1 + cpp/src/io/json/experimental/read_json.cpp | 46 +++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index e9a93894f7d..7520ca107cc 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -21,6 +21,7 @@ #pragma once +#include #include #include diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 146eaf203e4..fbe9b5f6112 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -17,15 +17,59 @@ #include "read_json.hpp" #include +#include namespace cudf::io::detail::json::experimental { +table_with_metadata read_nested_json(host_span input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FAIL("Not implemented"); +} + +std::vector ingest_raw_input(host_span> sources, + compression_type compression) +{ + // Iterate through the user defined sources and read the contents into the local buffer + size_t total_source_size = 0; + for (const auto& source : sources) { + total_source_size += source->size(); + } + + auto buffer = std::vector(total_source_size); + + size_t bytes_read = 0; + for (const auto& source : sources) { + if (not source->is_empty()) { + auto const destination = buffer.data() + bytes_read; + bytes_read += source->host_read(0, source->size(), destination); + } + } + + if (compression == compression_type::NONE) { + return buffer; + } else { + return decompress(compression, buffer); + } +} + table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FAIL("Not implemented"); + auto const dtypes_empty = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + CUDF_EXPECTS(dtypes_empty, 
"user specified dtypes are not yet supported"); + CUDF_EXPECTS(not reader_opts.is_enabled_lines(), "JSON Lines format is not yet supported"); + CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, + "specifying a byte range is not yet supported"); + + auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); + auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); + + return read_nested_json(data, stream, mr); } } // namespace cudf::io::detail::json::experimental From 83c12d5fe5fedf950e4f5610f4ec20ddfe6df9d0 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 9 Aug 2022 14:17:34 -0700 Subject: [PATCH 039/173] basic string column creation --- cpp/src/io/json/data_casting.cu | 16 --------------- cpp/src/io/json/data_casting.cuh | 30 +++++++++++++++++++++++------ cpp/tests/io/json_type_cast_test.cu | 25 ++++++++++-------------- 3 files changed, 34 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/data_casting.cu b/cpp/src/io/json/data_casting.cu index 9766ad24369..1c2cae2c15b 100644 --- a/cpp/src/io/json/data_casting.cu +++ b/cpp/src/io/json/data_casting.cu @@ -18,20 +18,4 @@ namespace cudf::io::json::experimental { -template -std::vector> parse_data(str_spans_it_it cols_str_spans, - col_size_it cols_size, - host_span cols_type, - rmm::cuda_stream_view stream) -{ - // first version: make_strings_column from the string spans - // full version: use existing code (`ConvertFunctor`) to convert values - return {}; -} - -template std::vector> parse_data( - string_view** cols_str_spans, - size_type* cols_size, - host_span cols_type, - rmm::cuda_stream_view stream); } // namespace cudf::io::json::experimental diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 95d37c8ba48..4daa598a338 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -18,16 +18,34 @@ #include #include +#include #include -#include namespace cudf::io::json::experimental { -template -std::vector> parse_data(str_spans_it_it cols_str_spans, - col_size_it cols_size, - host_span cols_type, - rmm::cuda_stream_view stream); +template +rmm::device_uvector> coalesce_input( + str_tuple_it str_tuples, size_type col_size, rmm::cuda_stream_view stream) +{ + auto result = rmm::device_uvector>(col_size, stream); + thrust::copy_n(rmm::exec_policy(stream), str_tuples, col_size, result.begin()); + return result; +} +template +std::unique_ptr parse_data(str_tuple_it str_tuples, + size_type col_size, + data_type col_type, + rmm::cuda_stream_view stream) +{ + if (col_type == cudf::data_type{cudf::type_id::STRING}) { + auto const strings_span = coalesce_input(str_tuples, col_size, stream); + return make_strings_column(strings_span, stream); + } else { + CUDF_FAIL("Type conversion not implemented"); + // full version: use existing code (`ConvertFunctor`) to convert values + } } + +} // namespace cudf::io::json::experimental diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 375dd0c0136..5d7b2d43faf 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -35,11 +35,11 @@ struct JSONTypeCastTest : public cudf::test::BaseFixture { }; namespace { -using string_pair = thrust::pair; -struct string_view_only { - __device__ cudf::string_view operator()(thrust::pair const& p) +struct to_thrust_pair_fn { + __device__ thrust::pair operator()( + thrust::pair const& p) { - return p.first; + return {p.first.data(), 
p.first.size_bytes()}; } }; } // namespace @@ -47,24 +47,19 @@ struct string_view_only { TEST_F(JSONTypeCastTest, RealBasic) { auto const stream = rmm::cuda_stream_default; - std::vector types{cudf::data_type{cudf::type_id::INT32}}; + auto const type = cudf::data_type{cudf::type_id::STRING}; cudf::test::strings_column_wrapper data({"this", "is", "a", "column", "of", "strings"}); auto d_column = cudf::column_device_view::create(data); - rmm::device_uvector svs(d_column->size(), rmm::cuda_stream_default); + rmm::device_uvector> svs(d_column->size(), stream); thrust::transform(thrust::device, d_column->pair_begin(), d_column->pair_end(), - svs.data(), - string_view_only{}); + svs.begin(), + to_thrust_pair_fn{}); - std::vector str_spans{svs.data()}; - auto d_str_spans = cudf::detail::make_device_uvector_async(str_spans, stream); - std::vector col_size{(cudf::size_type)svs.size()}; - auto d_col_size = cudf::detail::make_device_uvector_async(col_size, stream); - - cudf::io::json::experimental::parse_data( - d_str_spans.data(), d_col_size.data(), types, rmm::cuda_stream_default); + auto column = cudf::io::json::experimental::parse_data(svs.data(), svs.size(), type, stream); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(column->view(), data); } CUDF_TEST_PROGRAM_MAIN() From 84210c8a6f96a5d1f8c0c7d14be1d3ac01e0f47b Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 10 Aug 2022 11:32:34 -0700 Subject: [PATCH 040/173] basic conversion --- cpp/src/io/json/data_casting.cuh | 211 +++++++++++++++++++++++++++- cpp/tests/io/json_type_cast_test.cu | 30 +++- 2 files changed, 233 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 4daa598a338..049327107ad 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -14,7 +14,11 @@ * limitations under the License. */ + #include + #include +#include +#include #include #include @@ -22,8 +26,179 @@ #include + namespace cudf::io::json::experimental { +/** + * @brief Decodes a numeric value base on templated cudf type T with specified + * base. 
+ * + * @param[in] begin Beginning of the character string + * @param[in] end End of the character string + * @param opts The global parsing behavior options + * + * @return The parsed numeric value + */ + template + __inline__ __device__ T decode_value(const char* begin, + uint64_t end, + parse_options_view const& opts) + { + return cudf::io::parse_numeric(begin, end, opts); + } + + /** + * @brief Decodes a numeric value base on templated cudf type T + * + * @param[in] begin Beginning of the character string + * @param[in] end End of the character string + * @param opts The global parsing behavior options + * + * @return The parsed numeric value + */ + template () and !cudf::is_duration()>* = nullptr> + __inline__ __device__ T decode_value(const char* begin, + const char* end, + parse_options_view const& opts) + { + return cudf::io::parse_numeric(begin, end, opts); + } + + template ()>* = nullptr> + __inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) + { + return to_timestamp(begin, end, opts.dayfirst); + } + + template ()>* = nullptr> + __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) + { + return to_duration(begin, end); + } + + // The purpose of these is merely to allow compilation ONLY + template <> + __inline__ __device__ cudf::string_view decode_value(const char*, + const char*, + parse_options_view const&) + { + return cudf::string_view{}; + } + + template <> + __inline__ __device__ cudf::dictionary32 decode_value(const char*, + const char*, + parse_options_view const&) + { + return cudf::dictionary32{}; + } + + template <> + __inline__ __device__ cudf::list_view decode_value(const char*, + const char*, + parse_options_view const&) + { + return cudf::list_view{}; + } + template <> + __inline__ __device__ cudf::struct_view decode_value(const char*, + const char*, + parse_options_view const&) + { + return cudf::struct_view{}; + } + + template <> + __inline__ __device__ numeric::decimal32 decode_value(const char*, + const char*, + parse_options_view const&) + { + return numeric::decimal32{}; + } + + template <> + __inline__ __device__ numeric::decimal64 decode_value(const char*, + const char*, + parse_options_view const&) + { + return numeric::decimal64{}; + } + + template <> + __inline__ __device__ numeric::decimal128 decode_value(const char*, + const char*, + parse_options_view const&) + { + return numeric::decimal128{}; + } + + struct ConvertFunctor { + /** + * @brief Template specialization for operator() for types whose values can be + * convertible to a 0 or 1 to represent false/true. The converting is done by + * checking against the default and user-specified true/false values list. + * + * It is handled here rather than within convertStrToValue() as that function + * is used by other types (ex. timestamp) that aren't 'booleable'. 
+ */ + template >* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, + cudf::size_type row, + const parse_options_view& opts) + { + T& value{static_cast(output_column)[row]}; + + value = [&opts, end, begin]() -> T { + // Check for user-specified true/false values + auto const len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, len})) { return 1; } + if (serialized_trie_contains(opts.trie_false, {begin, len})) { return 0; } + return decode_value(begin, end, opts); + }(); + + return true; + } + + /** + * @brief Dispatch for floating points, which are set to NaN if the input + * is not valid. In such case, the validity mask is set to zero too. + */ + template >* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + parse_options_view const& opts) + { + T const value = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = value; + + return !std::isnan(value); + } + + /** + * @brief Default template operator() dispatch specialization all data types + * (including wrapper types) that is not covered by above. + */ + template and !std::is_integral_v>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, + cudf::size_type row, + const parse_options_view& opts) + { + static_cast(output_column)[row] = decode_value(begin, end, opts); + + return true; + } + }; + template rmm::device_uvector> coalesce_input( str_tuple_it str_tuples, size_type col_size, rmm::cuda_stream_view stream) @@ -33,19 +208,45 @@ rmm::device_uvector> coalesce_input( return result; } -template +template std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, data_type col_type, - rmm::cuda_stream_view stream) + B&& null_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + auto parse_opts = parse_options{',', '\n', '\"', '.'}; + + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + if (col_type == cudf::data_type{cudf::type_id::STRING}) { auto const strings_span = coalesce_input(str_tuples, col_size, stream); return make_strings_column(strings_span, stream); - } else { - CUDF_FAIL("Type conversion not implemented"); - // full version: use existing code (`ConvertFunctor`) to convert values } + + auto out_col = make_fixed_width_column( + col_type, col_size, std::move(null_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); + auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); + + // use existing code (`ConvertFunctor`) to convert values + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, col = *output_dv_ptr, opts = parse_opts.view()] __device__(size_type row_idx) { + auto const in = str_tuples[row_idx]; + cudf::type_dispatcher(column_types[desc.column], + ConvertFunctor{}, + in.first, + in.first + in.second, + col.data(), + row_idx, + opts) + }); + + return out_col; } } // namespace cudf::io::json::experimental diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 5d7b2d43faf..09cccb7f30f 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -44,9 +44,10 @@ struct 
to_thrust_pair_fn { }; } // namespace -TEST_F(JSONTypeCastTest, RealBasic) +TEST_F(JSONTypeCastTest, String) { auto const stream = rmm::cuda_stream_default; + auto mr = rmm::mr::get_current_device_resource(); auto const type = cudf::data_type{cudf::type_id::STRING}; cudf::test::strings_column_wrapper data({"this", "is", "a", "column", "of", "strings"}); @@ -58,8 +59,31 @@ TEST_F(JSONTypeCastTest, RealBasic) svs.begin(), to_thrust_pair_fn{}); - auto column = cudf::io::json::experimental::parse_data(svs.data(), svs.size(), type, stream); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(column->view(), data); + auto str_col = cudf::io::json::experimental::parse_data( + svs.data(), svs.size(), type, rmm::device_buffer{0, stream}, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(str_col->view(), data); +} + +TEST_F(JSONTypeCastTest, Int) +{ + auto const stream = rmm::cuda_stream_default; + auto mr = rmm::mr::get_current_device_resource(); + auto const type = cudf::data_type{cudf::type_id::INT32}; + + cudf::test::strings_column_wrapper data({"1", "2", "3", "4", "5", "6"}); + auto d_column = cudf::column_device_view::create(data); + rmm::device_uvector> svs(d_column->size(), stream); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + svs.begin(), + to_thrust_pair_fn{}); + + auto col = cudf::io::json::experimental::parse_data( + svs.data(), svs.size(), type, rmm::device_buffer{0, stream}, stream, mr); + + auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5, 6}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } CUDF_TEST_PROGRAM_MAIN() From 1db20e1bf0503befdb6c14ed624b0a1f50069c48 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 10 Aug 2022 12:57:24 -0700 Subject: [PATCH 041/173] remove duplicated `decode_value`s --- cpp/src/io/csv/csv_gpu.cu | 75 ---------- cpp/src/io/csv/datetime.cuh | 30 ++++ cpp/src/io/json/data_casting.cuh | 196 ++----------------------- cpp/src/io/json/json_gpu.cu | 176 ---------------------- cpp/src/io/utilities/parsing_utils.cuh | 185 ++++++++++++++++++++--- 5 files changed, 206 insertions(+), 456 deletions(-) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 55169e335cc..8ee92b1462c 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -16,18 +16,15 @@ #include "csv_common.hpp" #include "csv_gpu.hpp" -#include "datetime.cuh" #include #include #include #include -#include #include #include #include -#include #include #include #include @@ -293,78 +290,6 @@ __global__ void __launch_bounds__(csvparse_block_dim) } } -template -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -template () and !cudf::is_duration()>* = nullptr> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -template ()>* = nullptr> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return to_timestamp(begin, end, opts.dayfirst); -} - -template ()>* = nullptr> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return to_duration(begin, end); -} - -// The purpose of this is merely to allow compilation ONLY -// TODO : make this work for csv -template <> -__inline__ __device__ cudf::string_view decode_value(char const* begin, - char const* end, - 
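Because parse_data installs "true"/"false" tries before dispatching, integral columns also accept boolean literals, a path the Int test above does not exercise. A hypothetical extra case for the same test file (reusing its to_thrust_pair_fn helper) could look like this:

TEST_F(JSONTypeCastTest, IntWithBooleanLiterals)
{
  auto const stream = rmm::cuda_stream_default;
  auto mr           = rmm::mr::get_current_device_resource();
  auto const type   = cudf::data_type{cudf::type_id::INT32};

  // "true"/"false" match the tries set up in parse_data and decode to 1/0.
  cudf::test::strings_column_wrapper data({"1", "true", "false", "42"});
  auto d_column = cudf::column_device_view::create(data);
  rmm::device_uvector<thrust::pair<const char*, cudf::size_type>> svs(d_column->size(), stream);
  thrust::transform(thrust::device,
                    d_column->pair_begin<cudf::string_view, false>(),
                    d_column->pair_end<cudf::string_view, false>(),
                    svs.begin(),
                    to_thrust_pair_fn{});

  auto col = cudf::io::json::experimental::parse_data(
    svs.data(), svs.size(), type, rmm::device_buffer{0, stream}, stream, mr);

  auto expected = cudf::test::fixed_width_column_wrapper<int32_t>{{1, 1, 0, 42}};
  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected);
}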
parse_options_view const& opts) -{ - return cudf::string_view{}; -} - -// The purpose of this is merely to allow compilation ONLY -template <> -__inline__ __device__ cudf::dictionary32 decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::dictionary32{}; -} - -// The purpose of this is merely to allow compilation ONLY -// TODO : make this work for csv -template <> -__inline__ __device__ cudf::list_view decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::list_view{}; -} - -// The purpose of this is merely to allow compilation ONLY -// TODO : make this work for csv -template <> -__inline__ __device__ cudf::struct_view decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::struct_view{}; -} - /** * @brief Functor for converting CSV raw data to typed value. */ diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index 082674794fa..28ec3ec8895 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -21,6 +21,7 @@ #include +#include #include #include #include @@ -304,6 +305,35 @@ __inline__ __device__ T parse_optional_integer(char const** begin, char const* e return parse_integer(begin, end); } +/** + * @brief Finds the first element after the leading space characters. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + */ +__inline__ __device__ auto skip_spaces(char const* begin, char const* end) +{ + return thrust::find_if(thrust::seq, begin, end, [](auto elem) { return elem != ' '; }); +} + +/** + * @brief Excludes the prefix from the input range if the string starts with the prefix. + * + * @tparam N length on the prefix, plus one + * @param[in, out] begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param prefix String we're searching for at the start of the input range + */ +template +__inline__ __device__ auto skip_if_starts_with(char const* begin, + char const* end, + const char (&prefix)[N]) +{ + static constexpr size_t prefix_len = N - 1; + if (end - begin < prefix_len) return begin; + return thrust::equal(thrust::seq, begin, begin + prefix_len, prefix) ? begin + prefix_len : begin; +} + /** * @brief Parses the input string into a duration of `duration_type`. * diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 049327107ad..de1232bb84d 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -14,7 +14,7 @@ * limitations under the License. */ - #include +#include #include #include @@ -26,179 +26,8 @@ #include - namespace cudf::io::json::experimental { -/** - * @brief Decodes a numeric value base on templated cudf type T with specified - * base. 
- * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ - template - __inline__ __device__ T decode_value(const char* begin, - uint64_t end, - parse_options_view const& opts) - { - return cudf::io::parse_numeric(begin, end, opts); - } - - /** - * @brief Decodes a numeric value base on templated cudf type T - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ - template () and !cudf::is_duration()>* = nullptr> - __inline__ __device__ T decode_value(const char* begin, - const char* end, - parse_options_view const& opts) - { - return cudf::io::parse_numeric(begin, end, opts); - } - - template ()>* = nullptr> - __inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) - { - return to_timestamp(begin, end, opts.dayfirst); - } - - template ()>* = nullptr> - __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) - { - return to_duration(begin, end); - } - - // The purpose of these is merely to allow compilation ONLY - template <> - __inline__ __device__ cudf::string_view decode_value(const char*, - const char*, - parse_options_view const&) - { - return cudf::string_view{}; - } - - template <> - __inline__ __device__ cudf::dictionary32 decode_value(const char*, - const char*, - parse_options_view const&) - { - return cudf::dictionary32{}; - } - - template <> - __inline__ __device__ cudf::list_view decode_value(const char*, - const char*, - parse_options_view const&) - { - return cudf::list_view{}; - } - template <> - __inline__ __device__ cudf::struct_view decode_value(const char*, - const char*, - parse_options_view const&) - { - return cudf::struct_view{}; - } - - template <> - __inline__ __device__ numeric::decimal32 decode_value(const char*, - const char*, - parse_options_view const&) - { - return numeric::decimal32{}; - } - - template <> - __inline__ __device__ numeric::decimal64 decode_value(const char*, - const char*, - parse_options_view const&) - { - return numeric::decimal64{}; - } - - template <> - __inline__ __device__ numeric::decimal128 decode_value(const char*, - const char*, - parse_options_view const&) - { - return numeric::decimal128{}; - } - - struct ConvertFunctor { - /** - * @brief Template specialization for operator() for types whose values can be - * convertible to a 0 or 1 to represent false/true. The converting is done by - * checking against the default and user-specified true/false values list. - * - * It is handled here rather than within convertStrToValue() as that function - * is used by other types (ex. timestamp) that aren't 'booleable'. 
- */ - template >* = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* output_column, - cudf::size_type row, - const parse_options_view& opts) - { - T& value{static_cast(output_column)[row]}; - - value = [&opts, end, begin]() -> T { - // Check for user-specified true/false values - auto const len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, len})) { return 1; } - if (serialized_trie_contains(opts.trie_false, {begin, len})) { return 0; } - return decode_value(begin, end, opts); - }(); - - return true; - } - - /** - * @brief Dispatch for floating points, which are set to NaN if the input - * is not valid. In such case, the validity mask is set to zero too. - */ - template >* = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - parse_options_view const& opts) - { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; - - return !std::isnan(value); - } - - /** - * @brief Default template operator() dispatch specialization all data types - * (including wrapper types) that is not covered by above. - */ - template and !std::is_integral_v>* = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* output_column, - cudf::size_type row, - const parse_options_view& opts) - { - static_cast(output_column)[row] = decode_value(begin, end, opts); - - return true; - } - }; - template rmm::device_uvector> coalesce_input( str_tuple_it str_tuples, size_type col_size, rmm::cuda_stream_view stream) @@ -232,19 +61,16 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream); // use existing code (`ConvertFunctor`) to convert values - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, col = *output_dv_ptr, opts = parse_opts.view()] __device__(size_type row_idx) { - auto const in = str_tuples[row_idx]; - cudf::type_dispatcher(column_types[desc.column], - ConvertFunctor{}, - in.first, - in.first + in.second, - col.data(), - row_idx, - opts) - }); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, col = *output_dv_ptr, opts = parse_opts.view(), col_type] __device__( + size_type row_idx) mutable { + auto const in = str_tuples[row_idx]; + cudf::type_dispatcher( + col_type, ConvertFunctor{}, in.first, in.first + in.second, col.data(), row_idx, opts); + }); return out_col; } diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 7d37fdf4868..a3e8c5e3a7b 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -16,14 +16,11 @@ #include "json_gpu.hpp" -#include #include #include #include #include -#include -#include #include #include #include @@ -105,179 +102,6 @@ __device__ std::pair get_next_key(char const* begin, return {key_begin, key_end_pair.second}; } -/** - * @brief Decodes a numeric value base on templated cudf type T with specified - * base. 
- * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ -template -__inline__ __device__ T decode_value(const char* begin, - uint64_t end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -/** - * @brief Decodes a numeric value base on templated cudf type T - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ -template () and !cudf::is_duration()>* = nullptr> -__inline__ __device__ T decode_value(const char* begin, - const char* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -template ()>* = nullptr> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return to_timestamp(begin, end, opts.dayfirst); -} - -template ()>* = nullptr> -__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) -{ - return to_duration(begin, end); -} - -// The purpose of these is merely to allow compilation ONLY -template <> -__inline__ __device__ cudf::string_view decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::string_view{}; -} - -template <> -__inline__ __device__ cudf::dictionary32 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::dictionary32{}; -} - -template <> -__inline__ __device__ cudf::list_view decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::list_view{}; -} -template <> -__inline__ __device__ cudf::struct_view decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::struct_view{}; -} - -template <> -__inline__ __device__ numeric::decimal32 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return numeric::decimal32{}; -} - -template <> -__inline__ __device__ numeric::decimal64 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return numeric::decimal64{}; -} - -template <> -__inline__ __device__ numeric::decimal128 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return numeric::decimal128{}; -} - -/** - * @brief Functor for converting plain text data to cuDF data type value. - */ -struct ConvertFunctor { - /** - * @brief Template specialization for operator() for types whose values can be - * convertible to a 0 or 1 to represent false/true. The converting is done by - * checking against the default and user-specified true/false values list. - * - * It is handled here rather than within convertStrToValue() as that function - * is used by other types (ex. timestamp) that aren't 'booleable'. 
- */ - template >* = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* output_column, - cudf::size_type row, - const parse_options_view& opts) - { - T& value{static_cast(output_column)[row]}; - - value = [&opts, end, begin]() -> T { - // Check for user-specified true/false values - auto const len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, len})) { return 1; } - if (serialized_trie_contains(opts.trie_false, {begin, len})) { return 0; } - return decode_value(begin, end, opts); - }(); - - return true; - } - - /** - * @brief Dispatch for floating points, which are set to NaN if the input - * is not valid. In such case, the validity mask is set to zero too. - */ - template >* = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - parse_options_view const& opts) - { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; - - return !std::isnan(value); - } - - /** - * @brief Default template operator() dispatch specialization all data types - * (including wrapper types) that is not covered by above. - */ - template and !std::is_integral_v>* = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* output_column, - cudf::size_type row, - const parse_options_view& opts) - { - static_cast(output_column)[row] = decode_value(begin, end, opts); - - return true; - } -}; - /** * @brief Returns true is the input character is a valid digit. * Supports both decimal and hexadecimal digits (uppercase and lowercase). diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 1d97d4a344a..816a6d25db3 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -16,17 +16,21 @@ #pragma once +#include +#include + #include +#include +#include +#include #include -#include +#include #include "column_type_histogram.hpp" #include -#include #include -#include #include #include @@ -501,33 +505,174 @@ __inline__ __device__ std::pair trim_whitespaces_quote } /** - * @brief Excludes the prefix from the input range if the string starts with the prefix. + * @brief Decodes a numeric value base on templated cudf type T with specified + * base. * - * @tparam N length on the prefix, plus one - * @param[in, out] begin Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param prefix String we're searching for at the start of the input range + * @param[in] begin Beginning of the character string + * @param[in] end End of the character string + * @param opts The global parsing behavior options + * + * @return The parsed numeric value */ -template -__inline__ __device__ auto skip_if_starts_with(char const* begin, - char const* end, - const char (&prefix)[N]) +template +__inline__ __device__ T decode_value(const char* begin, + const char* end, + parse_options_view const& opts) { - static constexpr size_t prefix_len = N - 1; - if (end - begin < prefix_len) return begin; - return thrust::equal(thrust::seq, begin, begin + prefix_len, prefix) ? begin + prefix_len : begin; + return cudf::io::parse_numeric(begin, end, opts); } /** - * @brief Finds the first element after the leading space characters. 
+ * @brief Decodes a numeric value base on templated cudf type T * - * @param begin Pointer to the first element of the string - * @param end Pointer to the first element after the string + * @param[in] begin Beginning of the character string + * @param[in] end End of the character string + * @param opts The global parsing behavior options + * + * @return The parsed numeric value */ -__inline__ __device__ auto skip_spaces(char const* begin, char const* end) +template () and !cudf::is_duration()>* = nullptr> +__inline__ __device__ T decode_value(const char* begin, + const char* end, + parse_options_view const& opts) +{ + return cudf::io::parse_numeric(begin, end, opts); +} + +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - return thrust::find_if(thrust::seq, begin, end, [](auto elem) { return elem != ' '; }); + return to_timestamp(begin, end, opts.dayfirst); } +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) +{ + return to_duration(begin, end); +} + +// The purpose of these is merely to allow compilation ONLY +template <> +__inline__ __device__ cudf::string_view decode_value(const char*, + const char*, + parse_options_view const&) +{ + return cudf::string_view{}; +} + +template <> +__inline__ __device__ cudf::dictionary32 decode_value(const char*, + const char*, + parse_options_view const&) +{ + return cudf::dictionary32{}; +} + +template <> +__inline__ __device__ cudf::list_view decode_value(const char*, + const char*, + parse_options_view const&) +{ + return cudf::list_view{}; +} +template <> +__inline__ __device__ cudf::struct_view decode_value(const char*, + const char*, + parse_options_view const&) +{ + return cudf::struct_view{}; +} + +template <> +__inline__ __device__ numeric::decimal32 decode_value(const char*, + const char*, + parse_options_view const&) +{ + return numeric::decimal32{}; +} + +template <> +__inline__ __device__ numeric::decimal64 decode_value(const char*, + const char*, + parse_options_view const&) +{ + return numeric::decimal64{}; +} + +template <> +__inline__ __device__ numeric::decimal128 decode_value(const char*, + const char*, + parse_options_view const&) +{ + return numeric::decimal128{}; +} + +struct ConvertFunctor { + /** + * @brief Template specialization for operator() for types whose values can be + * convertible to a 0 or 1 to represent false/true. The converting is done by + * checking against the default and user-specified true/false values list. + * + * It is handled here rather than within convertStrToValue() as that function + * is used by other types (ex. timestamp) that aren't 'booleable'. + */ + template >* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, + cudf::size_type row, + const parse_options_view& opts) + { + T& value{static_cast(output_column)[row]}; + + value = [&opts, end, begin]() -> T { + // Check for user-specified true/false values + auto const len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, len})) { return 1; } + if (serialized_trie_contains(opts.trie_false, {begin, len})) { return 0; } + return decode_value(begin, end, opts); + }(); + + return true; + } + + /** + * @brief Dispatch for floating points, which are set to NaN if the input + * is not valid. In such case, the validity mask is set to zero too. 
+ */ + template >* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + parse_options_view const& opts) + { + T const value = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = value; + + return !std::isnan(value); + } + + /** + * @brief Default template operator() dispatch specialization all data types + * (including wrapper types) that is not covered by above. + */ + template and !std::is_integral_v>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, + cudf::size_type row, + const parse_options_view& opts) + { + static_cast(output_column)[row] = decode_value(begin, end, opts); + + return true; + } +}; + } // namespace io } // namespace cudf From 61e42ec8dca264f26953bfcf5d1559cc2d5ac859 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 10 Aug 2022 16:13:27 -0700 Subject: [PATCH 042/173] more redundancy removed --- cpp/src/io/csv/csv_gpu.cu | 127 +------------------------ cpp/src/io/json/data_casting.cuh | 11 ++- cpp/src/io/json/json_gpu.cu | 4 +- cpp/src/io/utilities/parsing_utils.cuh | 96 ++++++++++++++----- 4 files changed, 90 insertions(+), 148 deletions(-) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 8ee92b1462c..f8150ff76f6 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -290,125 +290,6 @@ __global__ void __launch_bounds__(csvparse_block_dim) } } -/** - * @brief Functor for converting CSV raw data to typed value. - */ -struct decode_op { - /** - * @brief Dispatch for numeric types whose values can be convertible to - * 0 or 1 to represent boolean false/true, based upon checking against a - * true/false values list. - * - * @return bool Whether the parsed value is valid. - */ - template and !std::is_same_v and - !cudf::is_fixed_point()>* = nullptr> - __host__ __device__ __forceinline__ bool operator()(void* out_buffer, - size_t row, - const data_type, - char const* begin, - char const* end, - parse_options_view const& opts, - column_parse::flags flags) - { - static_cast(out_buffer)[row] = [&flags, &opts, begin, end]() -> T { - // Check for user-specified true/false values - auto const field_len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; } - if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; } - return flags & column_parse::as_hexadecimal ? decode_value(begin, end, opts) - : decode_value(begin, end, opts); - }(); - - return true; - } - - /** - * @brief Dispatch for fixed point types. - * - * @return bool Whether the parsed value is valid. - */ - template ()>* = nullptr> - __host__ __device__ __forceinline__ bool operator()(void* out_buffer, - size_t row, - const data_type output_type, - char const* begin, - char const* end, - parse_options_view const& opts, - column_parse::flags flags) - { - static_cast*>(out_buffer)[row] = - [&flags, &opts, output_type, begin, end]() -> device_storage_type_t { - return strings::detail::parse_decimal>( - begin, end, output_type.scale()); - }(); - - return true; - } - - /** - * @brief Dispatch for boolean type types. 
- */ - template >* = nullptr> - __host__ __device__ __forceinline__ bool operator()(void* out_buffer, - size_t row, - const data_type, - char const* begin, - char const* end, - parse_options_view const& opts, - column_parse::flags flags) - { - static_cast(out_buffer)[row] = [&opts, begin, end]() { - // Check for user-specified true/false values - auto const field_len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } - if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } - return decode_value(begin, end, opts); - }(); - - return true; - } - - /** - * @brief Dispatch for floating points, which are set to NaN if the input - * is not valid. In such case, the validity mask is set to zero too. - */ - template >* = nullptr> - __host__ __device__ __forceinline__ bool operator()(void* out_buffer, - size_t row, - const data_type, - char const* begin, - char const* end, - parse_options_view const& opts, - column_parse::flags flags) - { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; - - return !std::isnan(value); - } - - /** - * @brief Dispatch for all other types. - */ - template and !std::is_floating_point_v and - !cudf::is_fixed_point()>* = nullptr> - __host__ __device__ __forceinline__ bool operator()(void* out_buffer, - size_t row, - const data_type, - char const* begin, - char const* end, - parse_options_view const& opts, - column_parse::flags flags) - { - static_cast(out_buffer)[row] = decode_value(begin, end, opts); - - return true; - } -}; - /** * @brief CUDA kernel that parses and converts CSV data into cuDF column data. * @@ -479,14 +360,14 @@ __global__ void __launch_bounds__(csvparse_block_dim) str_list[rec_id].second = end - field_start; } else { if (cudf::type_dispatcher(dtypes[actual_col], - decode_op{}, + ConvertFunctor{}, + field_start, + field_end, columns[actual_col], rec_id, dtypes[actual_col], - field_start, - field_end, options, - column_flags[col])) { + column_flags[col] & column_parse::as_hexadecimal)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valids[actual_col], rec_id); } diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index de1232bb84d..fe4226ec6e0 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -68,8 +68,15 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, [str_tuples, col = *output_dv_ptr, opts = parse_opts.view(), col_type] __device__( size_type row_idx) mutable { auto const in = str_tuples[row_idx]; - cudf::type_dispatcher( - col_type, ConvertFunctor{}, in.first, in.first + in.second, col.data(), row_idx, opts); + cudf::type_dispatcher(col_type, + ConvertFunctor{}, + in.first, + in.first + in.second, + col.data(), + row_idx, + col_type, + opts, + false); }); return out_col; diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index a3e8c5e3a7b..dbfcca7d37a 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -278,7 +278,9 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, desc.value_end, output_columns[desc.column], rec_id, - opts)) { + column_types[desc.column], + opts, + false)) { // set the valid bitmap - all bits were set to 0 to start set_bit(valid_fields[desc.column], rec_id); atomicAdd(&num_valid_fields[desc.column], 1); diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 816a6d25db3..1da90636465 100644 --- 
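After this consolidation the CSV decoding kernel, the legacy JSON kernel and the experimental JSON casting path all funnel through one call shape. A condensed, illustrative restatement of that contract (device code; parsing_utils.cuh is an internal header, so this is not public API):

// Illustrative only: the shared dispatch used by the call sites above. out
// points at the column's fixed-width buffer, as_hex mirrors
// column_parse::as_hexadecimal for CSV columns, and the return value reports
// whether the field parsed cleanly (floating point reports NaN as a failure).
__device__ bool convert_field(char const* begin,
                              char const* end,
                              void* out,
                              cudf::size_type row,
                              cudf::data_type dtype,
                              cudf::io::parse_options_view const& opts,
                              bool as_hex)
{
  return cudf::type_dispatcher(
    dtype, cudf::io::ConvertFunctor{}, begin, end, out, row, dtype, opts, as_hex);
}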
a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -612,27 +613,74 @@ __inline__ __device__ numeric::decimal128 decode_value(const char*, struct ConvertFunctor { /** - * @brief Template specialization for operator() for types whose values can be - * convertible to a 0 or 1 to represent false/true. The converting is done by - * checking against the default and user-specified true/false values list. + * @brief Dispatch for numeric types whose values can be convertible to + * 0 or 1 to represent boolean false/true, based upon checking against a + * true/false values list. * - * It is handled here rather than within convertStrToValue() as that function - * is used by other types (ex. timestamp) that aren't 'booleable'. + * @return bool Whether the parsed value is valid. */ - template >* = nullptr> + template and !std::is_same_v and + !cudf::is_fixed_point()>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + const data_type output_type, + parse_options_view const& opts, + bool as_hex = false) + { + static_cast(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T { + // Check for user-specified true/false values + auto const field_len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; } + return as_hex ? decode_value(begin, end, opts) : decode_value(begin, end, opts); + }(); + + return true; + } + + /** + * @brief Dispatch for fixed point types. + * + * @return bool Whether the parsed value is valid. + */ + template ()>* = nullptr> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, - void* output_column, - cudf::size_type row, - const parse_options_view& opts) + void* out_buffer, + size_t row, + const data_type output_type, + parse_options_view const& opts, + bool as_hex) { - T& value{static_cast(output_column)[row]}; + static_cast*>(out_buffer)[row] = + [&opts, output_type, begin, end]() -> device_storage_type_t { + return strings::detail::parse_decimal>( + begin, end, output_type.scale()); + }(); + + return true; + } - value = [&opts, end, begin]() -> T { + /** + * @brief Dispatch for boolean type types. 
+ */ + template >* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + const data_type output_type, + parse_options_view const& opts, + bool as_hex) + { + static_cast(out_buffer)[row] = [&opts, begin, end]() { // Check for user-specified true/false values - auto const len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, len})) { return 1; } - if (serialized_trie_contains(opts.trie_false, {begin, len})) { return 0; } + auto const field_len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } return decode_value(begin, end, opts); }(); @@ -648,7 +696,9 @@ struct ConvertFunctor { char const* end, void* out_buffer, size_t row, - parse_options_view const& opts) + const data_type output_type, + parse_options_view const& opts, + bool as_hex) { T const value = decode_value(begin, end, opts); static_cast(out_buffer)[row] = value; @@ -657,18 +707,20 @@ struct ConvertFunctor { } /** - * @brief Default template operator() dispatch specialization all data types - * (including wrapper types) that is not covered by above. + * @brief Dispatch for all other types. */ template and !std::is_integral_v>* = nullptr> + std::enable_if_t and !std::is_floating_point_v and + !cudf::is_fixed_point()>* = nullptr> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, - void* output_column, - cudf::size_type row, - const parse_options_view& opts) + void* out_buffer, + size_t row, + const data_type output_type, + parse_options_view const& opts, + bool as_hex) { - static_cast(output_column)[row] = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = decode_value(begin, end, opts); return true; } From db505d8a44ae0886e4783d06293e3d0d9e793d99 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 10 Aug 2022 21:04:29 -0700 Subject: [PATCH 043/173] fixed width null/true/false support --- cpp/src/io/json/data_casting.cuh | 30 +++++++++++++++++++---------- cpp/tests/io/json_type_cast_test.cu | 22 ++++++++++++++++----- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index fe4226ec6e0..9a0d00a2242 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -66,17 +66,27 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, thrust::make_counting_iterator(0), col_size, [str_tuples, col = *output_dv_ptr, opts = parse_opts.view(), col_type] __device__( - size_type row_idx) mutable { + size_type row_idx) { auto const in = str_tuples[row_idx]; - cudf::type_dispatcher(col_type, - ConvertFunctor{}, - in.first, - in.first + in.second, - col.data(), - row_idx, - col_type, - opts, - false); + + auto const is_null_literal = + serialized_trie_contains(opts.trie_na, {in.first, static_cast(in.second)}); + + if (is_null_literal) { + col.set_null(row_idx); + return; + } + + auto const is_parsed = cudf::type_dispatcher(col_type, + ConvertFunctor{}, + in.first, + in.first + in.second, + col.data(), + row_idx, + col_type, + opts, + false); + if (not is_parsed) { col.set_null(row_idx); } }); return out_col; diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 09cccb7f30f..94713e16606 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -18,6 +18,7 @@ 
#include #include #include +#include #include #include @@ -31,6 +32,8 @@ #include +using namespace cudf::test::iterators; + struct JSONTypeCastTest : public cudf::test::BaseFixture { }; @@ -59,8 +62,12 @@ TEST_F(JSONTypeCastTest, String) svs.begin(), to_thrust_pair_fn{}); + auto null_mask_it = no_nulls(); + auto null_mask = + cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); + auto str_col = cudf::io::json::experimental::parse_data( - svs.data(), svs.size(), type, rmm::device_buffer{0, stream}, stream, mr); + svs.data(), svs.size(), type, std::move(null_mask), stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(str_col->view(), data); } @@ -68,9 +75,9 @@ TEST_F(JSONTypeCastTest, Int) { auto const stream = rmm::cuda_stream_default; auto mr = rmm::mr::get_current_device_resource(); - auto const type = cudf::data_type{cudf::type_id::INT32}; + auto const type = cudf::data_type{cudf::type_id::INT64}; - cudf::test::strings_column_wrapper data({"1", "2", "3", "4", "5", "6"}); + cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"}); auto d_column = cudf::column_device_view::create(data); rmm::device_uvector> svs(d_column->size(), stream); thrust::transform(thrust::device, @@ -79,10 +86,15 @@ TEST_F(JSONTypeCastTest, Int) svs.begin(), to_thrust_pair_fn{}); + auto null_mask_it = no_nulls(); + auto null_mask = + cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); + auto col = cudf::io::json::experimental::parse_data( - svs.data(), svs.size(), type, rmm::device_buffer{0, stream}, stream, mr); + svs.data(), svs.size(), type, std::move(null_mask), stream, mr); - auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5, 6}}; + auto expected = + cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } From 988f788581bfe8b951b5fcc2ac76c7e5a0987572 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 11 Aug 2022 15:55:50 -0700 Subject: [PATCH 044/173] skip input nulls --- cpp/src/io/json/data_casting.cuh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 9a0d00a2242..d8e5aa61167 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -66,14 +66,15 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, thrust::make_counting_iterator(0), col_size, [str_tuples, col = *output_dv_ptr, opts = parse_opts.view(), col_type] __device__( - size_type row_idx) { - auto const in = str_tuples[row_idx]; + size_type row) { + if (col.is_null(row)) { return; } + auto const in = str_tuples[row]; auto const is_null_literal = serialized_trie_contains(opts.trie_na, {in.first, static_cast(in.second)}); if (is_null_literal) { - col.set_null(row_idx); + col.set_null(row); return; } @@ -82,11 +83,11 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, in.first, in.first + in.second, col.data(), - row_idx, + row, col_type, opts, false); - if (not is_parsed) { col.set_null(row_idx); } + if (not is_parsed) { col.set_null(row); } }); return out_col; From f69c7eaaaf49a0090c94d88b09671941f4ebc219 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 11 Aug 2022 15:57:23 -0700 Subject: [PATCH 045/173] pass options --- cpp/src/io/json/data_casting.cuh | 14 ++++---------- cpp/tests/io/json_type_cast_test.cu | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh 
b/cpp/src/io/json/data_casting.cuh index d8e5aa61167..dfd2936a49e 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -42,15 +42,10 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, data_type col_type, B&& null_mask, + cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto parse_opts = parse_options{',', '\n', '\"', '.'}; - - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - if (col_type == cudf::data_type{cudf::type_id::STRING}) { auto const strings_span = coalesce_input(str_tuples, col_size, stream); return make_strings_column(strings_span, stream); @@ -65,13 +60,12 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, rmm::exec_policy(stream), thrust::make_counting_iterator(0), col_size, - [str_tuples, col = *output_dv_ptr, opts = parse_opts.view(), col_type] __device__( - size_type row) { + [str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) { if (col.is_null(row)) { return; } auto const in = str_tuples[row]; auto const is_null_literal = - serialized_trie_contains(opts.trie_na, {in.first, static_cast(in.second)}); + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); if (is_null_literal) { col.set_null(row); @@ -85,7 +79,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, col.data(), row, col_type, - opts, + options, false); if (not is_parsed) { col.set_null(row); } }); diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 94713e16606..3b6099e28ba 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -47,6 +47,17 @@ struct to_thrust_pair_fn { }; } // namespace +auto default_json_options() +{ + auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; + + auto const stream = rmm::cuda_stream_default; + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + return parse_opts; +} + TEST_F(JSONTypeCastTest, String) { auto const stream = rmm::cuda_stream_default; @@ -67,7 +78,7 @@ TEST_F(JSONTypeCastTest, String) cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); auto str_col = cudf::io::json::experimental::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), stream, mr); + svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(str_col->view(), data); } @@ -91,7 +102,7 @@ TEST_F(JSONTypeCastTest, Int) cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); auto col = cudf::io::json::experimental::parse_data( - svs.data(), svs.size(), type, std::move(null_mask), stream, mr); + svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; From 41c221fc8655d27f1530fe46fb54fa985c43c0c0 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 11 Aug 2022 18:05:04 -0700 Subject: [PATCH 046/173] basic string column rewrite --- cpp/src/io/json/data_casting.cuh | 38 
+++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index dfd2936a49e..e08e8c79162 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -28,15 +28,6 @@ namespace cudf::io::json::experimental { -template -rmm::device_uvector> coalesce_input( - str_tuple_it str_tuples, size_type col_size, rmm::cuda_stream_view stream) -{ - auto result = rmm::device_uvector>(col_size, stream); - thrust::copy_n(rmm::exec_policy(stream), str_tuples, col_size, result.begin()); - return result; -} - template std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, @@ -47,8 +38,33 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, rmm::mr::device_memory_resource* mr) { if (col_type == cudf::data_type{cudf::type_id::STRING}) { - auto const strings_span = coalesce_input(str_tuples, col_size, stream); - return make_strings_column(strings_span, stream); + rmm::device_uvector offsets(col_size + 1, stream); + thrust::transform(rmm::exec_policy(stream), + str_tuples, + str_tuples + col_size, + offsets.begin(), + [] __device__(auto const& str) { return str.second; }); + thrust::exclusive_scan( + rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); + + rmm::device_uvector chars(offsets.back_element(stream), stream); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + chars = device_span{chars}, + offsets = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { return; } + auto const in = str_tuples[row]; + for (int i = 0; i < in.second; ++i) { + chars[offsets[row] + i] = *(in.first + i); + } + }); + + return make_strings_column( + col_size, std::move(offsets), std::move(chars), std::move(null_mask)); } auto out_col = make_fixed_width_column( From 5d73a8ac47cec35a451765554172a64ac7b2aa62 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 11 Aug 2022 23:37:27 -0700 Subject: [PATCH 047/173] string null literal + null pass throught --- cpp/src/io/json/data_casting.cuh | 29 ++++++++++++++++++++++++----- cpp/tests/io/json_type_cast_test.cu | 15 ++++++++++++--- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index e08e8c79162..437b3093af7 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -39,11 +39,30 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, { if (col_type == cudf::data_type{cudf::type_id::STRING}) { rmm::device_uvector offsets(col_size + 1, stream); - thrust::transform(rmm::exec_policy(stream), - str_tuples, - str_tuples + col_size, - offsets.begin(), - [] __device__(auto const& str) { return str.second; }); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + sizes = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { + sizes[row] = 0; + return; + } + auto const in = str_tuples[row]; + + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in.first, static_cast(in.second)}); + if (is_null_literal) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + + sizes[row] = in.second; + }); + thrust::exclusive_scan( rmm::exec_policy(stream), offsets.begin(), offsets.end(), 
offsets.begin()); diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 3b6099e28ba..dde4dcc8ba1 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -64,8 +64,11 @@ TEST_F(JSONTypeCastTest, String) auto mr = rmm::mr::get_current_device_resource(); auto const type = cudf::data_type{cudf::type_id::STRING}; - cudf::test::strings_column_wrapper data({"this", "is", "a", "column", "of", "strings"}); - auto d_column = cudf::column_device_view::create(data); + auto in_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + std::vector input_values{"this", "is", "null", "of", "", "strings"}; + cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); + + auto d_column = cudf::column_device_view::create(input); rmm::device_uvector> svs(d_column->size(), stream); thrust::transform(thrust::device, d_column->pair_begin(), @@ -79,7 +82,13 @@ TEST_F(JSONTypeCastTest, String) auto str_col = cudf::io::json::experimental::parse_data( svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(str_col->view(), data); + + auto out_valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); + std::vector expected_values{"this", "is", "", "of", "", "strings"}; + cudf::test::strings_column_wrapper expected( + expected_values.begin(), expected_values.end(), out_valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(str_col->view(), expected); } TEST_F(JSONTypeCastTest, Int) From 22b5a46c8a8bb6dfbdaad9827452dbc1792be375 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 15 Aug 2022 07:29:48 -0700 Subject: [PATCH 048/173] adds support for ndjson --- cpp/src/io/json/nested_json.hpp | 14 +- cpp/src/io/json/nested_json_gpu.cu | 371 +++++++++++++++++++---------- cpp/tests/io/nested_json_test.cpp | 61 ++++- 3 files changed, 316 insertions(+), 130 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 03acd393594..d8886bc0928 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -267,14 +268,14 @@ namespace detail { * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing * bracket would actually pop a the corresponding opening brace. * - * @param[in] d_json_in The string of input characters + * @param[in] json_in The string of input characters * @param[out] d_top_of_stack Will be populated with what-is-on-top-of-the-stack for any given input * character of \p d_json_in, where a '{' represents that the corresponding input character is * within the context of a struct, a '[' represents that it is within the context of an array, and a * '_' symbol that it is at the root of the JSON. * @param[in] stream The cuda stream to dispatch GPU kernels to */ -void get_stack_context(device_span d_json_in, +void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, rmm::cuda_stream_view stream); @@ -282,14 +283,17 @@ void get_stack_context(device_span d_json_in, * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant * sections from the input. 
* - * @param[in] d_json_in The JSON input + * @param[in] json_in The JSON input + * @param[in] options Parsing options specifying the parsing behaviour * @param[out] d_tokens Device memory to which the parsed tokens are written * @param[out] d_tokens_indices Device memory to which the indices are written, where each index * represents the offset within \p d_json_in that cause the input being written * @param[out] d_num_written_tokens The total number of tokens that were parsed * @param[in] stream The CUDA stream to which kernels are dispatched */ -void get_token_stream(device_span d_json_in, + +void get_token_stream(device_span json_in, + cudf::io::json_reader_options const& options, PdaTokenT* d_tokens, SymbolOffsetT* d_tokens_indices, SymbolOffsetT* d_num_written_tokens, @@ -299,12 +303,14 @@ void get_token_stream(device_span d_json_in, * @brief Parses the given JSON string and generates table from the given input. * * @param input The JSON input + * @param options Parsing options specifying the parsing behaviour * @param stream The CUDA stream to which kernels are dispatched * @param mr Optional, resource with which to allocate. * @return The data parsed from the given JSON input */ table_with_metadata parse_nested_json( host_span input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5e293f8a750..f8a862e2c65 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -164,6 +165,8 @@ enum class symbol_group_id : PdaSymbolGroupIdT { COLON, /// Whitespace WHITE_SPACE, + /// Linebreak + LINE_BREAK, /// Other (any input symbol not assigned to one of the above symbol groups) OTHER, /// Total number of symbol groups amongst which to differentiate @@ -206,7 +209,7 @@ static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::WHITE_SPACE), - static_cast(symbol_group_id::WHITE_SPACE), + static_cast(symbol_group_id::LINE_BREAK), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::WHITE_SPACE), @@ -403,62 +406,62 @@ constexpr auto PD_NUM_STATES = static_cast(pda_state_t::PD_NUM_STATES); // The starting state of the pushdown automaton constexpr auto start_state = static_cast(pda_state_t::PD_BOV); -// Identity symbol to symbol group lookup table -std::vector> const pda_sgids{ - {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, - {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}}; - /** * @brief Getting the transition table */ -auto get_transition_table() +auto get_transition_table(bool newline_delimited_json) { + static_assert(static_cast(stack_symbol_group_id::STACK_ROOT) == 0); + static_assert(static_cast(stack_symbol_group_id::STACK_LIST) == 1); + static_assert(static_cast(stack_symbol_group_id::STACK_STRUCT) == 2); + + auto const PD_ANL = newline_delimited_json ? 
PD_BOV : PD_PVL; std::array, PD_NUM_STATES> pda_tt; - // { [ } ] " \ , : space other + // { [ } ] " \ , : space newline other pda_tt[static_cast(pda_state_t::PD_BOV)] = { - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON}; + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; pda_tt[static_cast(pda_state_t::PD_BOA)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_LON)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_LON}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_LON}; pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { - 
PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BFN, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_FLN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_PFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_PFN, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_ERR)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; return pda_tt; } @@ -468,7 +471,8 @@ auto get_transition_table() auto get_translation_table() { std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{{token_t::StructBegin}, + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{/*ROOT*/ + {token_t::StructBegin}, {token_t::ListBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -477,7 +481,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}, + /*LIST*/ {token_t::StructBegin}, {token_t::ListBegin}, {token_t::ErrorBegin}, @@ -487,7 +493,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}, + /*STRUCT*/ {token_t::StructBegin}, {token_t::ListBegin}, 
{token_t::ErrorBegin}, @@ -497,8 +505,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -508,6 +518,8 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + /*LIST*/ {token_t::StructBegin}, {token_t::ListBegin}, {token_t::ErrorBegin}, @@ -517,7 +529,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::StructEnd}, @@ -527,8 +541,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_LON)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_LON)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -537,7 +553,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ValueEnd}, + {token_t::ValueEnd}, {}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -547,7 +565,9 @@ auto get_translation_table() {token_t::ValueEnd}, {token_t::ErrorBegin}, {token_t::ValueEnd}, + {token_t::ValueEnd}, {}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ValueEnd, token_t::StructEnd}, @@ -557,15 +577,82 @@ auto get_translation_table() {token_t::ValueEnd}, {token_t::ErrorBegin}, {token_t::ValueEnd}, + {token_t::ValueEnd}, + {}}}; + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{/*ROOT*/ + {}, + {}, + {}, + {}, + {token_t::StringEnd}, + {}, + {}, + {}, + {}, + {}, + {}, + /*LIST*/ + {}, + {}, + {}, + {}, + {token_t::StringEnd}, + {}, + {}, + {}, + {}, + {}, + {}, + /*STRUCT*/ + {}, + {}, + {}, + {}, + {token_t::StringEnd}, + {}, + {}, + {}, + {}, + {}, + {}}}; + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{/*ROOT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*LIST*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*STRUCT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_STR)] = { - {{}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -574,7 +661,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -584,7 +673,9 @@ auto get_translation_table() {}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::StructEnd}, @@ -594,8 +685,11 @@ auto get_translation_table() {}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}}}; - 
pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{/*ROOT*/ + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -605,6 +699,7 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -616,6 +711,8 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*STRUCT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -624,8 +721,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{/*ROOT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -637,6 +735,7 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -645,6 +744,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + /*STRUCT*/ {}, {}, {}, @@ -654,8 +757,10 @@ auto get_translation_table() {}, {}, {}, + {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -666,6 +771,7 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -675,6 +781,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + /*STRUCT*/ + {}, {}, {}, {}, @@ -685,7 +795,8 @@ auto get_translation_table() {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -696,6 +807,7 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -707,17 +819,54 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, {}, {}, {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{/*ROOT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*LIST*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*STRUCT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}}}; return pda_tlt; } @@ -792,11 +941,14 @@ void get_stack_context(device_span json_in, // TODO: return pair of device_uvector instead of passing pre-allocated pointers. 
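For context before the definition below: the new `options` argument is only consulted for `is_enabled_lines()`, which selects the `PD_ANL` target in the transition table above, so with `lines(true)` a newline that follows a completed value returns the automaton to `PD_BOV` (the start of the next record), while otherwise the newline is treated like ordinary whitespace. A minimal calling sketch for the signature introduced in this patch, assuming `d_input` is a device span that already holds the JSON text and `stream` is a valid `rmm::cuda_stream_view`; the buffer names are illustrative and not part of the patch:

  // Sketch only: d_input and stream are assumed to exist in the surrounding scope.
  cudf::io::json_reader_options const json_lines_options =
    cudf::io::json_reader_options_builder{}.lines(true);

  constexpr std::size_t single_item = 1;
  hostdevice_vector<PdaTokenT> tokens{d_input.size(), stream};
  hostdevice_vector<SymbolOffsetT> token_indices{d_input.size(), stream};
  hostdevice_vector<SymbolOffsetT> num_tokens_out{single_item, stream};

  get_token_stream(d_input,
                   json_lines_options,
                   tokens.device_ptr(),
                   token_indices.device_ptr(),
                   num_tokens_out.device_ptr(),
                   stream);

  // Copy the token types, token indices, and token count back to the host
  tokens.device_to_host(stream);
  token_indices.device_to_host(stream);
  num_tokens_out.device_to_host(stream);
  stream.synchronize();
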
void get_token_stream(device_span json_in, + cudf::io::json_reader_options const& options, PdaTokenT* d_tokens, SymbolOffsetT* d_tokens_indices, SymbolOffsetT* d_num_written_tokens, rmm::cuda_stream_view stream) { + auto const new_line_delimited_json = options.is_enabled_lines(); + // Memory holding the top-of-stack stack context for the input rmm::device_uvector stack_op_indices{json_in.size(), stream}; @@ -820,8 +972,12 @@ void get_token_stream(device_span json_in, tokenizer_pda::pda_state_t::PD_NUM_STATES)>; // Instantiating PDA transducer - ToTokenStreamFstT json_to_tokens_fst{tokenizer_pda::pda_sgids, - tokenizer_pda::get_transition_table(), + std::vector> pda_sgid_identity{tokenizer_pda::NUM_PDA_SGIDS}; + std::generate(std::begin(pda_sgid_identity), std::end(pda_sgid_identity), [i = 0]() mutable { + return std::vector{static_cast(i++)}; + }); + ToTokenStreamFstT json_to_tokens_fst{pda_sgid_identity, + tokenizer_pda::get_transition_table(new_line_delimited_json), tokenizer_pda::get_translation_table(), stream}; @@ -850,6 +1006,7 @@ void make_json_column(json_column& root_column, std::stack& current_data_path, host_span input, device_span d_input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { // Default name for a list's child column @@ -862,6 +1019,7 @@ void make_json_column(json_column& root_column, // Parse the JSON and get the token stream get_token_stream(d_input, + options, tokens_gpu.device_ptr(), token_indices_gpu.device_ptr(), num_tokens_out.device_ptr(), @@ -897,15 +1055,6 @@ void make_json_column(json_column& root_column, }; }; - // Whether this token is a beginning-of-list or beginning-of-struct token - auto is_nested_token = [](PdaTokenT const token) { - switch (token) { - case token_t::StructBegin: - case token_t::ListBegin: return true; - default: return false; - }; - }; - // Skips the quote char if the token is a beginning-of-string or beginning-of-field-name token auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT skip_quote_char = 1; @@ -1061,7 +1210,6 @@ void make_json_column(json_column& root_column, std::size_t offset = 0; // Giving names to magic constants - constexpr uint32_t row_offset_zero = 0; constexpr uint32_t zero_child_count = 0; //-------------------------------------------------------------------------------- @@ -1071,51 +1219,6 @@ void make_json_column(json_column& root_column, CUDF_EXPECTS(num_tokens_out[0] > 0, "Empty JSON input not supported"); CUDF_EXPECTS(is_valid_root_token(tokens_gpu[offset]), "Invalid beginning of JSON document"); - // The JSON root is either a struct or list - if (is_nested_token(tokens_gpu[offset])) { - // Initialize the root column and append this row to it - root_column.append_row(row_offset_zero, - token_to_column_type(tokens_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - 0); - - // Push the root node onto the stack for the data path - current_data_path.push({&root_column, row_offset_zero, nullptr, zero_child_count}); - - // Continue with the next token from the token stream - offset++; - } - // The JSON is a simple scalar value -> create simple table and return - else { - constexpr SymbolOffsetT max_tokens_for_scalar_value = 2; - CUDF_EXPECTS(num_tokens_out[0] <= max_tokens_for_scalar_value, - "Invalid JSON format. 
Expected just a scalar value."); - - // If this isn't the only token, verify the subsequent token is the correct end-of-* partner - if ((offset + 1) < num_tokens_out[0]) { - CUDF_EXPECTS(tokens_gpu[offset + 1] == end_of_partner(tokens_gpu[offset]), - "Invalid JSON token sequence"); - } - - // The offset to the first symbol from the JSON input associated with the current token - auto const& token_begin_offset = get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); - - // The offset to one past the last symbol associated with the current token - // Literals without trailing space are missing the corresponding end-of-* counterpart. - auto const& token_end_offset = - (offset + 1 < num_tokens_out[0]) - ? get_token_index(tokens_gpu[offset + 1], token_indices_gpu[offset + 1]) - : input.size(); - - root_column.append_row(row_offset_zero, - json_col_t::StringColumn, - token_begin_offset, - token_end_offset, - zero_child_count); - return; - } - while (offset < num_tokens_out[0]) { // Verify there's at least the JSON root node left on the stack to which we can append data CUDF_EXPECTS(current_data_path.size() > 0, "Invalid JSON structure"); @@ -1215,6 +1318,7 @@ void make_json_column(json_column& root_column, else if (token == token_t::ErrorBegin) { #ifdef NJP_DEBUG_PRINT std::cout << "[ErrorBegin]\n"; + std::cout << "@" << get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); #endif CUDF_FAIL("Parser encountered an invalid format."); } @@ -1371,26 +1475,51 @@ std::pair, std::vector> json_column_to } table_with_metadata parse_nested_json(host_span input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + auto const new_line_delimited_json = options.is_enabled_lines(); + // Allocate device memory for the JSON input & copy over to device rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); // Get internal JSON column json_column root_column{}; std::stack data_path{}; - make_json_column(root_column, data_path, input, d_input, stream); + + constexpr uint32_t row_offset_zero = 0; + constexpr uint32_t token_begin_offset_zero = 0; + constexpr uint32_t token_end_offset_zero = 0; + constexpr uint32_t node_init_child_count_zero = 0; + + // We initialize the very root node and root column that represents a list column that contains + // all the values found at the root "level" of the given JSON string Initialize the root column + // For JSON lines: we expect to find a list of values that all will be inserted into this list + // column. + // For regular JSON: we expect to have only a single value (single row) that will be inserted into + // this column + root_column.append_row( + row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1); + + // Push the root node onto the stack for the data path + data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); + + make_json_column(root_column, data_path, input, d_input, options, stream); + + // data_root refers to the root column of the data represented by the given JSON string + auto const& data_root = + new_line_delimited_json ? 
root_column : root_column.child_columns.begin()->second; // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) auto constexpr single_child_col_count = 1; - CUDF_EXPECTS(root_column.type == json_col_t::ListColumn and - root_column.child_columns.size() == single_child_col_count and - root_column.child_columns.begin()->second.type == json_col_t::StructColumn, + CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and + data_root.child_columns.size() == single_child_col_count and + data_root.child_columns.begin()->second.type == json_col_t::StructColumn, "Currently the nested JSON parser only supports an array of (nested) objects"); // Slice off the root list column, which has only a single row that contains all the structs - auto const& root_struct_col = root_column.child_columns.begin()->second; + auto const& root_struct_col = data_root.child_columns.begin()->second; // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index d426acf26f9..a217b2f7d18 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -251,6 +252,9 @@ TEST_F(JsonTest, TokenStream) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options default_options{}; + // Test input std::string input = R"( [{)" R"("category": "reference",)" @@ -282,6 +286,7 @@ TEST_F(JsonTest, TokenStream) // Parse the JSON and get the token stream cuio_json::detail::get_token_stream(d_input, + default_options, tokens_gpu.device_ptr(), token_indices_gpu.device_ptr(), num_tokens_out.device_ptr(), @@ -342,10 +347,13 @@ TEST_F(JsonTest, ExtractColumn) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options default_options{}; + std::string input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; // Get the JSON's tree representation auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream_view); auto const expected_col_count = 2; auto const first_column_index = 0; @@ -366,6 +374,9 @@ TEST_F(JsonTest, UTF_JSON) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options default_options{}; + // Only ASCII string std::string ascii_pass = R"([ {"a":1,"b":2,"c":[3], "d": {}}, @@ -375,7 +386,8 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(ascii_pass, stream_view)); + CUDF_EXPECT_NO_THROW( + cuio_json::detail::parse_nested_json(ascii_pass, default_options, stream_view)); // utf-8 string that fails parsing. std::string utf_failed = R"([ @@ -385,7 +397,8 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":8.0,"c":null, "d": {}}, {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip Ê’akotÉ›"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_failed, stream_view)); + CUDF_EXPECT_NO_THROW( + cuio_json::detail::parse_nested_json(utf_failed, default_options, stream_view)); // utf-8 string that passes parsing. 
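A brief note on why the UTF-8 inputs in this test are expected to tokenize: the pushdown automaton classifies bytes, not code points, and any byte that is not one of the few structural JSON characters falls into the OTHER symbol group, so multi-byte UTF-8 sequences inside strings and values are carried through untouched. A rough host-side sketch of that classification, for illustration only (the authoritative mapping is the constant `tos_sg_to_pda_sgid` table above):

  // Rough sketch, not part of the patch: bytes the tokenizer treats as structural
  // or whitespace; everything else maps to symbol_group_id::OTHER and is passed
  // through as opaque string/value payload.
  bool is_structural_json_byte(char c)
  {
    return c == '{' || c == '}' || c == '[' || c == ']' || c == '"' || c == '\\' ||
           c == ',' || c == ':' || c == ' ' || c == '\t' || c == '\n' || c == '\r';
  }
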
std::string utf_pass = R"([ @@ -396,7 +409,8 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip Ê’akotÉ›"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, stream_view)); + CUDF_EXPECT_NO_THROW( + cuio_json::detail::parse_nested_json(utf_pass, default_options, stream_view)); } TEST_F(JsonTest, FromParquet) @@ -410,6 +424,9 @@ TEST_F(JsonTest, FromParquet) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options default_options{}; + // Binary parquet data containing the same data as the data represented by the JSON string. // We could add a dataset to include this file, but we don't want tests in cudf to have data. const unsigned char parquet_data[] = { @@ -496,7 +513,7 @@ TEST_F(JsonTest, FromParquet) // Read in the data via the JSON parser auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream_view); // Verify that the data read via parquet matches the data read via JSON CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view()); @@ -504,3 +521,37 @@ TEST_F(JsonTest, FromParquet) // Verify that the schema read via parquet matches the schema read via JSON cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata); } + +TEST_F(JsonTest, JsonLines) +{ + // Prepare cuda stream for data transfers & kernels + rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); + + // Default parsing options + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options_builder{}.lines(true); + + using cuio_json::SymbolT; + + std::string json_string = + R"({"a":"a0"} + {"a":"a1"} + {"a":"a2", "b":"b2"} + {"a":"a3", "c":"c3"} + {"a":"a4"})"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + cudf::io::table_with_metadata old_reader_table = cudf::io::read_json(in_options); + + auto const new_reader_table = cuio_json::detail::parse_nested_json( + cudf::host_span{json_string.data(), json_string.size()}, + json_lines_options, + stream_view); + + // Verify that the data read via parquet matches the data read via JSON + CUDF_TEST_EXPECT_TABLES_EQUAL(old_reader_table.tbl->view(), new_reader_table.tbl->view()); +} From 87fce7d05c3a5377a2a68b2a374db1ebd127ee54 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 15 Aug 2022 10:48:12 -0700 Subject: [PATCH 049/173] addresses outstanding todo --- cpp/src/io/json/nested_json.hpp | 26 ++++++------- cpp/src/io/json/nested_json_gpu.cu | 61 ++++++++++++++++-------------- cpp/tests/io/nested_json_test.cpp | 26 +++++-------- 3 files changed, 53 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index d8886bc0928..1048f9fcedd 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -285,27 +285,25 @@ void get_stack_context(device_span json_in, * * @param[in] json_in The JSON input * @param[in] options Parsing options specifying the parsing behaviour - * @param[out] d_tokens Device memory to which the parsed tokens are written - * @param[out] d_tokens_indices Device memory to which the 
indices are written, where each index * represents the offset within \p d_json_in that cause the input being written - * @param[out] d_num_written_tokens The total number of tokens that were parsed * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] mr Optional, resource with which to allocate + * @return Pair of device vectors, where the first vector represents the token types and the second + * vector represents the index within the input corresponding to each token */ - -void get_token_stream(device_span json_in, - cudf::io::json_reader_options const& options, - PdaTokenT* d_tokens, - SymbolOffsetT* d_tokens_indices, - SymbolOffsetT* d_num_written_tokens, - rmm::cuda_stream_view stream); +std::pair, rmm::device_uvector> get_token_stream( + device_span json_in, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Parses the given JSON string and generates table from the given input. * - * @param input The JSON input - * @param options Parsing options specifying the parsing behaviour - * @param stream The CUDA stream to which kernels are dispatched - * @param mr Optional, resource with which to allocate. + * @param[in] input The JSON input + * @param[in] options Parsing options specifying the parsing behaviour + * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] mr Optional, resource with which to allocate * @return The data parsed from the given JSON input */ table_with_metadata parse_nested_json( diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index f8a862e2c65..34c85402284 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -939,14 +939,17 @@ void get_stack_context(device_span json_in, stream); } -// TODO: return pair of device_uvector instead of passing pre-allocated pointers. 
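Since resolving the TODO above is the point of this patch, a short consumption sketch for the new return type may help: callers now receive the token-type and token-index vectors as a pair and copy them to the host with the existing detail helpers. This mirrors the updated `make_json_column` and test code further down; `d_input`, `options`, and `stream` are assumed to be set up already, and the local names are illustrative:

  // Sketch only: d_input, options, and stream come from the surrounding scope.
  auto const [d_tokens, d_token_indices] = get_token_stream(d_input, options, stream);

  thrust::host_vector<PdaTokenT> tokens =
    cudf::detail::make_host_vector_async(d_tokens, stream);
  thrust::host_vector<SymbolOffsetT> token_indices =
    cudf::detail::make_host_vector_async(d_token_indices, stream);

  // Ensure the asynchronous copies have completed before the host reads the tokens
  stream.synchronize();
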
-void get_token_stream(device_span json_in, - cudf::io::json_reader_options const& options, - PdaTokenT* d_tokens, - SymbolOffsetT* d_tokens_indices, - SymbolOffsetT* d_num_written_tokens, - rmm::cuda_stream_view stream) +std::pair, rmm::device_uvector> get_token_stream( + device_span json_in, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + constexpr std::size_t single_item_count = 1ULL; + rmm::device_uvector tokens{json_in.size(), stream, mr}; + rmm::device_uvector tokens_indices{json_in.size(), stream, mr}; + rmm::device_uvector num_written_tokens{single_item_count, stream}; + auto const new_line_delimited_json = options.is_enabled_lines(); // Memory holding the top-of-stack stack context for the input @@ -984,11 +987,17 @@ void get_token_stream(device_span json_in, // Perform a PDA-transducer pass json_to_tokens_fst.Transduce(pda_sgids.begin(), static_cast(json_in.size()), - d_tokens, - d_tokens_indices, - d_num_written_tokens, + tokens.data(), + tokens_indices.data(), + num_written_tokens.data(), tokenizer_pda::start_state, stream); + + auto num_total_tokens = num_written_tokens.front_element(stream); + tokens.resize(num_total_tokens, stream); + tokens_indices.resize(num_total_tokens, stream); + + return std::make_pair(std::move(tokens), std::move(tokens_indices)); } /** @@ -1007,28 +1016,20 @@ void make_json_column(json_column& root_column, host_span input, device_span d_input, cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // Default name for a list's child column std::string const list_child_name = "element"; - constexpr std::size_t single_item = 1; - hostdevice_vector tokens_gpu{input.size(), stream}; - hostdevice_vector token_indices_gpu{input.size(), stream}; - hostdevice_vector num_tokens_out{single_item, stream}; - // Parse the JSON and get the token stream - get_token_stream(d_input, - options, - tokens_gpu.device_ptr(), - token_indices_gpu.device_ptr(), - num_tokens_out.device_ptr(), - stream); + const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - token_indices_gpu.device_to_host(stream); - tokens_gpu.device_to_host(stream); - num_tokens_out.device_to_host(stream); + thrust::host_vector tokens_gpu = + cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + thrust::host_vector token_indices_gpu = + cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); @@ -1216,10 +1217,12 @@ void make_json_column(json_column& root_column, // INITIALIZE JSON ROOT NODE //-------------------------------------------------------------------------------- // The JSON root may only be a struct, list, string, or value node - CUDF_EXPECTS(num_tokens_out[0] > 0, "Empty JSON input not supported"); + CUDF_EXPECTS(tokens_gpu.size() == token_indices_gpu.size(), + "Unexpected mismatch in number of token types and token indices"); + CUDF_EXPECTS(tokens_gpu.size() > 0, "Empty JSON input not supported"); CUDF_EXPECTS(is_valid_root_token(tokens_gpu[offset]), "Invalid beginning of JSON document"); - while (offset < num_tokens_out[0]) { + while (offset < tokens_gpu.size()) { // Verify there's at least the JSON root node left on the stack to which we can append data CUDF_EXPECTS(current_data_path.size() > 0, "Invalid JSON 
structure"); @@ -1327,7 +1330,7 @@ void make_json_column(json_column& root_column, else if (token == token_t::FieldNameBegin or token == token_t::StringBegin or token == token_t::ValueBegin) { // Verify that this token has the right successor to build a correct (being, end) token pair - CUDF_EXPECTS((offset + 1) < num_tokens_out[0], "Invalid JSON token sequence"); + CUDF_EXPECTS((offset + 1) < tokens_gpu.size(), "Invalid JSON token sequence"); CUDF_EXPECTS(tokens_gpu[offset + 1] == end_of_partner(token), "Invalid JSON token sequence"); // The offset to the first symbol from the JSON input associated with the current token @@ -1505,7 +1508,7 @@ table_with_metadata parse_nested_json(host_span input, // Push the root node onto the stack for the data path data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); - make_json_column(root_column, data_path, input, d_input, options, stream); + make_json_column(root_column, data_path, input, d_input, options, stream, mr); // data_root refers to the root column of the data represented by the given JSON string auto const& data_root = diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index a217b2f7d18..cae0083daed 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -246,8 +246,6 @@ TEST_F(JsonTest, TokenStream) using cuio_json::SymbolOffsetT; using cuio_json::SymbolT; - constexpr std::size_t single_item = 1; - // Prepare cuda stream for data transfers & kernels rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); @@ -280,22 +278,15 @@ TEST_F(JsonTest, TokenStream) cudaMemcpyHostToDevice, stream.value())); - hostdevice_vector tokens_gpu{input.size(), stream_view}; - hostdevice_vector token_indices_gpu{input.size(), stream_view}; - hostdevice_vector num_tokens_out{single_item, stream_view}; - // Parse the JSON and get the token stream - cuio_json::detail::get_token_stream(d_input, - default_options, - tokens_gpu.device_ptr(), - token_indices_gpu.device_ptr(), - num_tokens_out.device_ptr(), - stream_view); + const auto [d_tokens_gpu, d_token_indices_gpu] = + cuio_json::detail::get_token_stream(d_input, default_options, stream_view); // Copy back the number of tokens that were written - num_tokens_out.device_to_host(stream_view); - tokens_gpu.device_to_host(stream_view); - token_indices_gpu.device_to_host(stream_view); + thrust::host_vector tokens_gpu = + cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + thrust::host_vector token_indices_gpu = + cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure we copied back all relevant data stream_view.synchronize(); @@ -328,9 +319,10 @@ TEST_F(JsonTest, TokenStream) {267, token_t::StructEnd}, {268, token_t::ListEnd}}; // Verify the number of tokens matches - ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); - for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { // Ensure the index the tokens are pointing to do match EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; From 9669c6a1f49eebee8c7c7a5c251ff2e0a54afa98 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 16:39:06 -0700 Subject: [PATCH 050/173] C++ side changes + test --- cpp/src/io/json/experimental/read_json.cpp | 10 ++------ cpp/tests/io/json_test.cpp | 27 
++++++++++++++++++---- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index fbe9b5f6112..0c579cbf035 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -15,19 +15,13 @@ */ #include "read_json.hpp" +#include #include #include namespace cudf::io::detail::json::experimental { -table_with_metadata read_nested_json(host_span input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FAIL("Not implemented"); -} - std::vector ingest_raw_input(host_span> sources, compression_type compression) { @@ -69,7 +63,7 @@ table_with_metadata read_json(host_span> sources, auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); - return read_nested_json(data, stream, mr); + return cudf::io::json::detail::parse_nested_json(data, stream, mr); } } // namespace cudf::io::detail::json::experimental diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index c8aefece94f..3866def2cdf 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -915,13 +915,30 @@ TEST_F(JsonReaderTest, BadDtypeParams) EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error); } -TEST_F(JsonReaderTest, ExperimentalParam) +TEST_F(JsonReaderTest, JsonRecordsBasic) { - cudf_io::json_reader_options const options = - cudf_io::json_reader_options::builder(cudf_io::source_info{nullptr, 0}).experimental(true); + const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; + std::ofstream outfile(fname, std::ofstream::out); + outfile << "[{\"a\":\"11\", \"b\":\"1.1\"},{\"a\":\"22\", \"b\":\"2.2\"}]"; + outfile.close(); + + cudf_io::json_reader_options options = + cudf_io::json_reader_options::builder(cudf_io::source_info{fname}).experimental(true); + auto result = cudf_io::read_json(options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 2); - // should throw for now - EXPECT_THROW(cudf_io::read_json(options), cudf::logic_error); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(std::string(result.metadata.column_names[0]), "a"); + EXPECT_EQ(std::string(result.metadata.column_names[1]), "b"); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + cudf::test::strings_column_wrapper({"11", "22"})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + cudf::test::strings_column_wrapper({"1.1", "2.2"})); } CUDF_TEST_PROGRAM_MAIN() From c9fb5b28d158ff198b042684f4a9d69803c7cb91 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 16:51:58 -0700 Subject: [PATCH 051/173] working Python + test --- cpp/src/io/json/nested_json_gpu.cu | 4 +++- python/cudf/cudf/tests/test_json.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5e293f8a750..bffc8891020 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1395,11 +1395,13 @@ table_with_metadata parse_nested_json(host_span input, // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; std::vector out_column_names; + std::vector out_root_column_names; // Iterate over the struct's child columns and 
convert to cudf column for (auto const& [col_name, json_col] : root_struct_col.child_columns) { // Insert this columns name into the schema out_column_names.emplace_back(col_name); + out_root_column_names.emplace_back(col_name); // Get this JSON column's cudf column and schema info auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr); @@ -1408,7 +1410,7 @@ table_with_metadata parse_nested_json(host_span input, } return table_with_metadata{std::make_unique(std::move(out_columns)), - {{}, out_column_names}}; + {out_root_column_names, out_column_names}}; } } // namespace detail diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 800ed68e8a4..5122c976f27 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -579,3 +579,22 @@ def test_json_experimental(): # should raise an exception, for now with pytest.raises(RuntimeError): cudf.read_json("", engine="cudf_experimental") + + +def test_json_nested_basic(tmpdir): + fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_basic") + data = { + "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], + "c2": [["l11", "l21"], ["l12", "l22"]], + } + pdf = pd.DataFrame(data) + pdf.to_json(fname, orient="records") + + with open(fname, "r") as f: + print(f.read()) + print(pdf) + + df = cudf.read_json(fname, engine="cudf_experimental", orient="records") + pdf = pd.read_json(fname, orient="records") + + assert_eq(cudf.DataFrame(pdf), df) From 2de91e18bedb8a97c3cd23648f733eaa170849c7 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 17:12:03 -0700 Subject: [PATCH 052/173] clean up --- cpp/src/io/json/experimental/read_json.cpp | 23 ++++++++-------------- cpp/tests/io/json_test.cpp | 2 +- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 0c579cbf035..2259fcc839a 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -20,32 +20,25 @@ #include #include +#include + namespace cudf::io::detail::json::experimental { std::vector ingest_raw_input(host_span> sources, compression_type compression) { - // Iterate through the user defined sources and read the contents into the local buffer - size_t total_source_size = 0; - for (const auto& source : sources) { - total_source_size += source->size(); - } - + auto const total_source_size = + std::accumulate(sources.begin(), sources.end(), 0ul, [](size_t sum, auto& source) { + return sum + source->size(); + }); auto buffer = std::vector(total_source_size); size_t bytes_read = 0; for (const auto& source : sources) { - if (not source->is_empty()) { - auto const destination = buffer.data() + bytes_read; - bytes_read += source->host_read(0, source->size(), destination); - } + bytes_read += source->host_read(0, source->size(), buffer.data() + bytes_read); } - if (compression == compression_type::NONE) { - return buffer; - } else { - return decompress(compression, buffer); - } + return (compression == compression_type::NONE) ? 
buffer : decompress(compression, buffer); } table_with_metadata read_json(host_span> sources, diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 3866def2cdf..4f98dc54a73 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -915,7 +915,7 @@ TEST_F(JsonReaderTest, BadDtypeParams) EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error); } -TEST_F(JsonReaderTest, JsonRecordsBasic) +TEST_F(JsonReaderTest, JsonExperimentalBasic) { const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(fname, std::ofstream::out); From 70dd9b1c0df226809b84788a133f5f0974b88315 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 23:43:20 -0700 Subject: [PATCH 053/173] stop using column_names --- cpp/src/io/json/nested_json_gpu.cu | 4 +-- cpp/src/io/json/reader_impl.cu | 19 +++++++---- cpp/tests/io/json_test.cpp | 52 +++++++++++++++--------------- python/cudf/cudf/_lib/json.pyx | 2 +- 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index bffc8891020..5e293f8a750 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1395,13 +1395,11 @@ table_with_metadata parse_nested_json(host_span input, // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; std::vector out_column_names; - std::vector out_root_column_names; // Iterate over the struct's child columns and convert to cudf column for (auto const& [col_name, json_col] : root_struct_col.child_columns) { // Insert this columns name into the schema out_column_names.emplace_back(col_name); - out_root_column_names.emplace_back(col_name); // Get this JSON column's cudf column and schema info auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr); @@ -1410,7 +1408,7 @@ table_with_metadata parse_nested_json(host_span input, } return table_with_metadata{std::make_unique
(std::move(out_columns)), - {out_root_column_names, out_column_names}}; + {{}, out_column_names}}; } } // namespace detail diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 6b12b462dd9..3be0ff318a1 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -480,7 +480,7 @@ std::vector get_data_types(json_reader_options const& reader_opts, table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, - std::vector const& column_names, + std::vector&& column_names, col_map_type* column_map, device_span rec_starts, device_span data, @@ -552,8 +552,8 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector column_infos; column_infos.reserve(column_names.size()); - std::transform(column_names.cbegin(), - column_names.cend(), + std::transform(std::make_move_iterator(column_names.begin()), + std::make_move_iterator(column_names.end()), std::back_inserter(column_infos), [](auto const& col_name) { return column_name_info{col_name}; }); @@ -563,8 +563,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input"); - return table_with_metadata{std::make_unique
(std::move(out_columns)), - {column_names, column_infos}}; + return table_with_metadata{std::make_unique
(std::move(out_columns)), {{}, column_infos}}; } /** @@ -636,8 +635,14 @@ table_with_metadata read_json(std::vector>& sources, CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table( - parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, d_data, stream, mr); + return convert_data_to_table(parse_opts.view(), + dtypes, + std::move(column_names), + column_map.get(), + rec_starts, + d_data, + stream, + mr); } } // namespace json diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 4f98dc54a73..adf97bf3e2a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -171,8 +171,8 @@ TEST_F(JsonReaderTest, BasicJsonLines) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(result.metadata.column_names[0], "0"); - EXPECT_EQ(result.metadata.column_names[1], "1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -228,9 +228,9 @@ TEST_F(JsonReaderTest, JsonLinesStrings) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); - EXPECT_EQ(result.metadata.column_names[0], "0"); - EXPECT_EQ(result.metadata.column_names[1], "1"); - EXPECT_EQ(result.metadata.column_names[2], "2"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); + EXPECT_EQ(result.metadata.schema_info[2].name, "2"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -414,9 +414,9 @@ TEST_F(JsonReaderTest, JsonLinesDtypeInference) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "1"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "2"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); + EXPECT_EQ(result.metadata.schema_info[2].name, "2"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -444,8 +444,8 @@ TEST_F(JsonReaderTest, JsonLinesFileInput) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -472,7 +472,7 @@ TEST_F(JsonReaderTest, JsonLinesByteRange) EXPECT_EQ(result.tbl->num_rows(), 3); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -496,9 +496,9 @@ TEST_F(JsonReaderTest, JsonLinesObjects) EXPECT_EQ(result.tbl->num_rows(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), 
cudf::type_id::INT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "co\\\"l1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "co\\\"l1"); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col2"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -522,9 +522,9 @@ TEST_F(JsonReaderTest, JsonLinesObjectsStrings) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "col1"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col2"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "col3"); + EXPECT_EQ(result.metadata.schema_info[0].name, "col1"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); + EXPECT_EQ(result.metadata.schema_info[2].name, "col3"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -563,9 +563,9 @@ TEST_F(JsonReaderTest, JsonLinesObjectsMissingData) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "col2"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col3"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "col1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "col2"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col3"); + EXPECT_EQ(result.metadata.schema_info[2].name, "col1"); auto col1_validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); @@ -598,9 +598,9 @@ TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "col1"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col2"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "col3"); + EXPECT_EQ(result.metadata.schema_info[0].name, "col1"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); + EXPECT_EQ(result.metadata.schema_info[2].name, "col3"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -881,8 +881,8 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -932,8 +932,8 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "a"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "b"); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); 
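This commit ("stop using column_names") moves every consumer from metadata.column_names to metadata.schema_info, as the test hunks above show. As a rough editorial sketch (not part of the patch) of what a caller now does to recover flat top-level names — assuming column_name_info carries a `name` string plus a nested `children` vector, as it does on this branch:

#include <cudf/io/types.hpp>
#include <string>
#include <vector>

// Collect top-level column names from the reader's schema_info; nested children
// (e.g. the "offsets"/"chars" entries reported for a strings column) are skipped here.
std::vector<std::string> top_level_names(cudf::io::table_metadata const& meta)
{
  std::vector<std::string> names;
  names.reserve(meta.schema_info.size());
  for (auto const& info : meta.schema_info) {
    names.push_back(info.name);
  }
  return names;
}

The json.pyx change later in this commit is the Python-side equivalent of this loop.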
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cudf::test::strings_column_wrapper({"11", "22"})); diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 0ee6062e7f2..376850b7b1b 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -113,7 +113,7 @@ cpdef read_json(object filepaths_or_buffers, with nogil: c_result = move(libcudf_read_json(opts)) - meta_names = [name.decode() for name in c_result.metadata.column_names] + meta_names = [info.name.decode() for info in c_result.metadata.schema_info] df = cudf.DataFrame._from_data(*data_from_unique_ptr( move(c_result.tbl), column_names=meta_names From b1afef0c3ea332c62c4e5fd5e49fa15b14e8a705 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 02:01:20 -0700 Subject: [PATCH 054/173] adds documentation for mr parameter --- cpp/src/io/json/nested_json_gpu.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 34c85402284..4c21b9a78a8 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1009,6 +1009,7 @@ std::pair, rmm::device_uvector> ge * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ void make_json_column(json_column& root_column, From 8409214ead5b150122a60b3c1b1db5fcecc59c9e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 02:04:44 -0700 Subject: [PATCH 055/173] minor documentation fixes --- cpp/src/io/json/nested_json.hpp | 1 - cpp/src/io/json/nested_json_gpu.cu | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1048f9fcedd..47ce1edafaf 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -285,7 +285,6 @@ void get_stack_context(device_span json_in, * * @param[in] json_in The JSON input * @param[in] options Parsing options specifying the parsing behaviour - * represents the offset within \p d_json_in that cause the input being written * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] mr Optional, resource with which to allocate * @return Pair of device vectors, where the first vector represents the token types and the second diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 4c21b9a78a8..b51d1270f22 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1009,6 +1009,7 @@ std::pair, rmm::device_uvector> ge * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] options Parsing options specifying the parsing behaviour * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ From d0e0defcdaf3c50da6cc13f174a4d55846ec23d2 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 02:06:08 -0700 Subject: [PATCH 056/173] fixes parameter order --- cpp/src/io/json/nested_json_gpu.cu | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b51d1270f22..26d7aaf3b2b 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1008,8 +1008,8 @@ std::pair, rmm::device_uvector> ge * first node encountered in \p input * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory - * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] options Parsing options specifying the parsing behaviour + * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ From 6294dc8bd3e33c2ef66845ef9b8bbb72f9934018 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 03:16:15 -0700 Subject: [PATCH 057/173] adds option to include quote chars for string values --- cpp/src/io/json/nested_json_gpu.cu | 39 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b51d1270f22..4bfbd0b5fbe 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1010,6 +1010,8 @@ std::pair, rmm::device_uvector> ge * @param[in] d_input The JSON input in device memory * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] options Parsing options specifying the parsing behaviour + * @param[in] include_quote_char Whether to include the original quote chars for string values, + * allowing to distinguish string values from numeric and literal values * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ @@ -1018,6 +1020,7 @@ void make_json_column(json_column& root_column, host_span input, device_span d_input, cudf::io::json_reader_options const& options, + bool include_quote_char, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -1058,14 +1061,30 @@ void make_json_column(json_column& root_column, }; }; - // Skips the quote char if the token is a beginning-of-string or beginning-of-field-name token - auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) { - constexpr SymbolOffsetT skip_quote_char = 1; - switch (token) { - case token_t::StringBegin: return token_index + skip_quote_char; - case token_t::FieldNameBegin: return token_index + skip_quote_char; - default: return token_index; - }; + // Depending on whether we want to include the quotes of strings or not, respectively, we: + // (a) strip off the beginning quote included in StringBegin and FieldNameBegin or + // (b) include of the end quote excluded from in StringEnd and strip off the beginning quote + // included FieldNameBegin + auto get_token_index = [include_quote_char](PdaTokenT const token, + SymbolOffsetT const token_index) { + constexpr SymbolOffsetT quote_char_size = 1; + if (include_quote_char) { + switch (token) { + // Include trailing quote char for string values excluded for StringEnd + case token_t::StringEnd: return token_index + quote_char_size; + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; + default: return token_index; + }; + } else { + switch (token) { + // Strip off quote 
char included for StringBegin + case token_t::StringBegin: return token_index + quote_char_size; + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; + default: return token_index; + }; + } }; // The end-of-* partner token for a given beginning-of-* token @@ -1497,6 +1516,7 @@ table_with_metadata parse_nested_json(host_span input, constexpr uint32_t token_begin_offset_zero = 0; constexpr uint32_t token_end_offset_zero = 0; constexpr uint32_t node_init_child_count_zero = 0; + constexpr bool include_quote_chars = false; // We initialize the very root node and root column that represents a list column that contains // all the values found at the root "level" of the given JSON string Initialize the root column @@ -1510,7 +1530,8 @@ table_with_metadata parse_nested_json(host_span input, // Push the root node onto the stack for the data path data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); - make_json_column(root_column, data_path, input, d_input, options, stream, mr); + make_json_column( + root_column, data_path, input, d_input, options, include_quote_chars, stream, mr); // data_root refers to the root column of the data represented by the given JSON string auto const& data_root = From 574ac4397e25f05cbb7bdf1d12a1c673c5ecb543 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 09:07:52 -0700 Subject: [PATCH 058/173] fix copy-paste error --- cpp/tests/io/json_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index adf97bf3e2a..c3af9fc2eb0 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -917,7 +917,7 @@ TEST_F(JsonReaderTest, BadDtypeParams) TEST_F(JsonReaderTest, JsonExperimentalBasic) { - const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; + std::string const fname = temp_env->get_temp_dir() + "JsonExperimentalBasic.json"; std::ofstream outfile(fname, std::ofstream::out); outfile << "[{\"a\":\"11\", \"b\":\"1.1\"},{\"a\":\"22\", \"b\":\"2.2\"}]"; outfile.close(); From 2de80b74a42c21a4b4b738e013df22c403125e96 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 16 Aug 2022 09:11:04 -0700 Subject: [PATCH 059/173] raw string Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/tests/io/json_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index adf97bf3e2a..79d7bf241f6 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -919,7 +919,7 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) { const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(fname, std::ofstream::out); - outfile << "[{\"a\":\"11\", \"b\":\"1.1\"},{\"a\":\"22\", \"b\":\"2.2\"}]"; + outfile << R"([{"a":"11", "b":"1.1"},{"a":"22", "b":"2.2"}])" outfile.close(); cudf_io::json_reader_options options = From bc14a1dd71a12ca11b1964b4ae10f4d3932f374e Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 09:12:23 -0700 Subject: [PATCH 060/173] remove print in Python test --- python/cudf/cudf/tests/test_json.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 5122c976f27..6beb050d920 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -590,10 +590,6 @@ def 
test_json_nested_basic(tmpdir): pdf = pd.DataFrame(data) pdf.to_json(fname, orient="records") - with open(fname, "r") as f: - print(f.read()) - print(pdf) - df = cudf.read_json(fname, engine="cudf_experimental", orient="records") pdf = pd.read_json(fname, orient="records") From bca2e839d3db8ea70abea83b6d281be613bad9cb Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 09:21:48 -0700 Subject: [PATCH 061/173] addressing reviews --- cpp/src/io/json/experimental/read_json.cpp | 3 ++- python/cudf/cudf/tests/test_json.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 2259fcc839a..e070aacaca2 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -15,10 +15,11 @@ */ #include "read_json.hpp" + +#include #include #include -#include #include diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 6beb050d920..368015cf563 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -593,4 +593,4 @@ def test_json_nested_basic(tmpdir): df = cudf.read_json(fname, engine="cudf_experimental", orient="records") pdf = pd.read_json(fname, orient="records") - assert_eq(cudf.DataFrame(pdf), df) + assert_eq(pdf, df) From ba28571ca2492f5edcb5b6f76c08ce751146a94f Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 10:57:10 -0700 Subject: [PATCH 062/173] Java fix --- java/src/main/native/src/TableJni.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 44c08aec110..857fac7df2b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1459,7 +1459,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::table_with_metadata result = cudf::io::read_json(opts.build()); // there is no need to re-order columns when inferring schema - if (result.metadata.column_names.empty() || n_col_names.size() <= 0) { + if (result.metadata.schema_info.empty() || n_col_names.size() <= 0) { return convert_table_for_return(env, result.tbl); } else { // json reader will not return the correct column order, @@ -1467,10 +1467,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( // turn name and its index in table into map std::map m; - std::transform(result.metadata.column_names.begin(), result.metadata.column_names.end(), + std::transform(result.metadata.schema_info.cbegin(), result.metadata.schema_info.cend(), thrust::make_counting_iterator(0), std::inserter(m, m.end()), - [](auto const &column_name, auto const &index) { - return std::make_pair(column_name, index); + [](auto const &column_info, auto const &index) { + return std::make_pair(column_info.name, index); }); auto col_names_vec = n_col_names.as_cpp_vector(); From a6d5ab732f6cc474289a088ecb1e1c9287fee728 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 11:59:42 -0700 Subject: [PATCH 063/173] style --- cpp/tests/io/json_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index fcecea8e7e0..67f0542ace2 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -919,7 +919,7 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) { std::string const fname = temp_env->get_temp_dir() + "JsonExperimentalBasic.json"; std::ofstream outfile(fname, 
std::ofstream::out); - outfile << R"([{"a":"11", "b":"1.1"},{"a":"22", "b":"2.2"}])" + outfile << R"([{"a":"11", "b":"1.1"},{"a":"22", "b":"2.2"}])"; outfile.close(); cudf_io::json_reader_options options = From a0bd2292f1dcfca9d4b6470c17c0f4b07d85d93f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 23:28:54 -0700 Subject: [PATCH 064/173] integrates upstream interface changes --- cpp/src/io/json/experimental/read_json.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index e070aacaca2..cc154d5f325 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -57,7 +57,7 @@ table_with_metadata read_json(host_span> sources, auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); - return cudf::io::json::detail::parse_nested_json(data, stream, mr); + return cudf::io::json::detail::parse_nested_json(data, reader_opts, stream, mr); } } // namespace cudf::io::detail::json::experimental From f3bba9d4181822704a917b199591ea452bfd46ef Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 17 Aug 2022 10:14:26 -0700 Subject: [PATCH 065/173] enables lines option in the nested reader --- cpp/src/io/json/experimental/read_json.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index cc154d5f325..ceac40ba4f9 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -50,7 +50,6 @@ table_with_metadata read_json(host_span> sources, auto const dtypes_empty = std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); CUDF_EXPECTS(dtypes_empty, "user specified dtypes are not yet supported"); - CUDF_EXPECTS(not reader_opts.is_enabled_lines(), "JSON Lines format is not yet supported"); CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, "specifying a byte range is not yet supported"); From 21b40231e5c6e6e05c548519b5419df84ffb9a83 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 17 Aug 2022 11:32:15 -0700 Subject: [PATCH 066/173] migrates test from details api to reader api --- cpp/tests/io/json_test.cpp | 27 ++++++++++++++++++++++++ cpp/tests/io/nested_json_test.cpp | 34 ------------------------------- 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 67f0542ace2..af72edce91b 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -941,4 +942,30 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) cudf::test::strings_column_wrapper({"1.1", "2.2"})); } +TEST_F(JsonReaderTest, JsonExperimentalLines) +{ + std::string json_string = + R"({"a":"a0"} + {"a":"a1"} + {"a":"a2", "b":"b2"} + {"a":"a3", "c":"c3"} + {"a":"a4"})"; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + + // Read test data via existing, non-nested json lines reader + cudf::io::table_with_metadata 
current_reader_table = cudf::io::read_json(json_lines_options); + + // Read test data via new, nested json reader + json_lines_options.enable_experimental(true); + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify that the data read via parquet matches the data read via JSON + CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index cae0083daed..7ba7e0a4a03 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -513,37 +513,3 @@ TEST_F(JsonTest, FromParquet) // Verify that the schema read via parquet matches the schema read via JSON cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata); } - -TEST_F(JsonTest, JsonLines) -{ - // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); - - // Default parsing options - cudf::io::json_reader_options json_lines_options = - cudf::io::json_reader_options_builder{}.lines(true); - - using cuio_json::SymbolT; - - std::string json_string = - R"({"a":"a0"} - {"a":"a1"} - {"a":"a2", "b":"b2"} - {"a":"a3", "c":"c3"} - {"a":"a4"})"; - - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder( - cudf::io::source_info{json_string.c_str(), json_string.size()}) - .lines(true); - cudf::io::table_with_metadata old_reader_table = cudf::io::read_json(in_options); - - auto const new_reader_table = cuio_json::detail::parse_nested_json( - cudf::host_span{json_string.data(), json_string.size()}, - json_lines_options, - stream_view); - - // Verify that the data read via parquet matches the data read via JSON - CUDF_TEST_EXPECT_TABLES_EQUAL(old_reader_table.tbl->view(), new_reader_table.tbl->view()); -} From cdc44411a385fb71da6954a98ebd2a59944fcf0a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 17 Aug 2022 11:58:45 -0700 Subject: [PATCH 067/173] improves code comment --- cpp/src/io/json/nested_json_gpu.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 26d7aaf3b2b..07348e67b6c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1498,12 +1498,13 @@ table_with_metadata parse_nested_json(host_span input, constexpr uint32_t token_end_offset_zero = 0; constexpr uint32_t node_init_child_count_zero = 0; - // We initialize the very root node and root column that represents a list column that contains - // all the values found at the root "level" of the given JSON string Initialize the root column - // For JSON lines: we expect to find a list of values that all will be inserted into this list + // We initialize the very root node and root column, which represent the JSON document being + // parsed. That root node is a list node and that root column is a list column. The column has the + // root node as its only row. The values parsed from the JSON input will be treated as follows: + // (1) For JSON lines: we expect to find a list of JSON values that all + // will be inserted into this root list column. (2) For regular JSON: we expect to have only a + // single value (list, struct, string, number, literal) that will be inserted into this root // column. 
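// Editorial example for intuition, not part of this diff (assumes the lines option is
// enabled): a two-line input such as
//   {"a":"a0"}
//   {"a":"a1"}
// contributes two values under this root list column, one per JSON line, whereas the
// regular-JSON input [{"a":"a0"},{"a":"a1"}] contributes a single top-level value (the
// list itself); in both cases the root column keeps just its one root row.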
- // For regular JSON: we expect to have only a single value (single row) that will be inserted into - // this column root_column.append_row( row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1); From 7479b63c5095e01bd4c6e51e5089432d52113ad6 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 19 Aug 2022 03:16:23 -0700 Subject: [PATCH 068/173] adds inference and type conversion --- cpp/src/io/json/nested_json.hpp | 1 + cpp/src/io/json/nested_json_gpu.cu | 97 ++++++++++++++++++++++-------- cpp/tests/io/json_test.cpp | 24 ++++++++ 3 files changed, 98 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 47ce1edafaf..cf8971ee588 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -125,6 +125,7 @@ struct json_column { // Following "items" as the default child column's name of a list column // Using the struct's field names std::map child_columns; + std::vector column_order; // Counting the current number of items in this column row_offset_t current_offset = 0; diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 8bec1993158..17f44a980d5 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -18,7 +18,10 @@ #include #include +#include #include +#include +#include #include #include @@ -34,6 +37,7 @@ #include #include +#include #include #include @@ -1167,6 +1171,7 @@ void make_json_column(json_column& root_column, if (current_data_path.top().column->child_columns.size() == 0) { current_data_path.top().column->child_columns.emplace(std::string{list_child_name}, json_column{json_col_t::Unknown}); + current_data_path.top().column->column_order.push_back(list_child_name); } current_data_path.top().current_selected_col = ¤t_data_path.top().column->child_columns.begin()->second; @@ -1206,6 +1211,7 @@ void make_json_column(json_column& root_column, // The field name's column does not exist yet, so we have to append the child column to the // struct column + struct_col->column_order.push_back(field_name); return &struct_col->child_columns.emplace(field_name, json_column{}).first->second; }; @@ -1402,16 +1408,37 @@ void make_json_column(json_column& root_column, root_column.level_child_cols_recursively(root_column.current_offset); } +auto default_json_options() +{ + auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; + + auto const stream = rmm::cuda_stream_default; + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + return parse_opts; +} + +auto default_inference_options() +{ + fst::detail::inference_options parse_opts{}; + + auto const stream = rmm::cuda_stream_default; + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + return parse_opts; +} + std::pair, std::vector> json_column_to_cudf_column( json_column const& json_col, device_span d_input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto make_validity = [stream, mr](json_column const& json_col) -> std::pair { - if 
(json_col.current_offset == json_col.valid_count) { return {rmm::device_buffer{}, 0}; } - return {rmm::device_buffer{json_col.validity.data(), bitmask_allocation_size_bytes(json_col.current_offset), stream, @@ -1421,29 +1448,48 @@ std::pair, std::vector> json_column_to switch (json_col.type) { case json_col_t::StringColumn: { - // move string_offsets to GPU and transform to string column - auto const col_size = json_col.string_offsets.size(); - using char_length_pair_t = thrust::pair; + auto const col_size = json_col.string_offsets.size(); CUDF_EXPECTS(json_col.string_offsets.size() == json_col.string_lengths.size(), "string offset, string length mismatch"); - rmm::device_uvector d_string_data(col_size, stream); + + // Move string_offsets and string_lengths to GPU rmm::device_uvector d_string_offsets = cudf::detail::make_device_uvector_async(json_col.string_offsets, stream); rmm::device_uvector d_string_lengths = cudf::detail::make_device_uvector_async(json_col.string_lengths, stream); + + // Prepare iterator that returns (string_offset, string_length)-tuples auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - thrust::transform(rmm::exec_policy(stream), - offset_length_it, - offset_length_it + col_size, - d_string_data.data(), - [data = d_input.data()] __device__(auto ip) { - return char_length_pair_t{data + thrust::get<0>(ip), thrust::get<1>(ip)}; - }); - auto str_col_ptr = make_strings_column(d_string_data, stream, mr); - auto [result_bitmask, null_count] = make_validity(json_col); - str_col_ptr->set_null_mask(result_bitmask, null_count); - return {std::move(str_col_ptr), {{"offsets"}, {"chars"}}}; + + // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference + auto string_ranges_it = + thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { + return thrust::pair{ + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto string_spans_it = thrust::make_transform_iterator( + offset_length_it, [data = d_input.data()] __device__(auto ip) { + return thrust::pair{ + data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + // Infer column type + auto target_type = fst::detail::detect_data_type( + default_inference_options().view(), d_input, string_ranges_it, col_size, stream); + + // Convert strings to the inferred data type + auto col = cudf::io::json::experimental::parse_data(string_spans_it, + col_size, + target_type, + make_validity(json_col).first, + default_json_options().view(), + stream, + mr); + + return {std::move(col), {{"offsets"}, {"chars"}}}; break; } case json_col_t::StructColumn: { @@ -1453,8 +1499,9 @@ std::pair, std::vector> json_column_to // Create children columns for (auto const& col : json_col.child_columns) { column_names.emplace_back(col.first); - auto const& child_col = col.second; - auto [child_column, names] = json_column_to_cudf_column(child_col, d_input, stream, mr); + auto const& child_col = col.second; + auto [child_column, names] = + json_column_to_cudf_column(child_col, d_input, options, stream, mr); CUDF_EXPECTS(num_rows == child_column->size(), "All children columns must have the same size"); child_columns.push_back(std::move(child_column)); @@ -1478,8 +1525,8 @@ std::pair, std::vector> json_column_to auto offsets_column = std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column - auto 
[child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, d_input, stream, mr); + auto [child_column, names] = json_column_to_cudf_column( + json_col.child_columns.begin()->second, d_input, options, stream, mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, @@ -1516,7 +1563,7 @@ table_with_metadata parse_nested_json(host_span input, constexpr uint32_t token_begin_offset_zero = 0; constexpr uint32_t token_end_offset_zero = 0; constexpr uint32_t node_init_child_count_zero = 0; - constexpr bool include_quote_chars = false; + constexpr bool include_quote_chars = true; // We initialize the very root node and root column, which represent the JSON document being // parsed. That root node is a list node and that root column is a list column. The column has the @@ -1553,12 +1600,14 @@ table_with_metadata parse_nested_json(host_span input, std::vector out_column_names; // Iterate over the struct's child columns and convert to cudf column - for (auto const& [col_name, json_col] : root_struct_col.child_columns) { + for (auto const& col_name : root_struct_col.column_order) { + auto const& json_col = root_struct_col.child_columns.find(col_name)->second; // Insert this columns name into the schema out_column_names.emplace_back(col_name); // Get this JSON column's cudf column and schema info - auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr); + auto [cudf_col, col_name_info] = + json_column_to_cudf_column(json_col, d_input, options, stream, mr); out_column_names.back().children = std::move(col_name_info); out_columns.emplace_back(std::move(cudf_col)); } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index af72edce91b..4afa9c094c4 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -968,4 +969,27 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); } +TEST_F(JsonReaderTest, JsonExperimentalLarge) +{ + // std::string json_path = "/raid/estehle/rapids/cudf/large.json"; + std::string json_path = "/raid/estehle/rapids/cudf/large32x.json"; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{json_path}).lines(true); + + // Read test data via existing, non-nested json lines reader + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + + // Read test data via new, nested json reader + json_lines_options.enable_experimental(true); + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify that the data read via parquet matches the data read via JSON + CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); + + // TODO enable once existing JSON lines reader's schema generation has been adjusted + // cudf::test::expect_metadata_equal(current_reader_table.metadata, new_reader_table.metadata); +} + CUDF_TEST_PROGRAM_MAIN() From a6598170bc0c22380910335169fd4d212f2a74f2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Aug 2022 10:18:10 -0400 Subject: [PATCH 069/173] Move type inference to utilities --- cpp/src/io/{fst => utilities}/type_inference.cuh | 2 -- 1 file changed, 2 deletions(-) rename 
cpp/src/io/{fst => utilities}/type_inference.cuh (99%) diff --git a/cpp/src/io/fst/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh similarity index 99% rename from cpp/src/io/fst/type_inference.cuh rename to cpp/src/io/utilities/type_inference.cuh index 5bf687e106e..dc615cba4ca 100644 --- a/cpp/src/io/fst/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -32,7 +32,6 @@ namespace cudf { namespace io { -namespace fst { namespace detail { /** * @brief Structure for type inference options @@ -245,6 +244,5 @@ cudf::data_type detect_data_type(inference_options_view const& options, return cudf::data_type{get_type_id(h_column_info)}; } } // namespace detail -} // namespace fst } // namespace io } // namespace cudf From 441048894bee6b6ba3e809bad7763b792ea33efe Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Aug 2022 10:29:41 -0400 Subject: [PATCH 070/173] Resolve conflicts + relocate type inference test file --- cpp/src/io/json/nested_json.h | 165 -------- cpp/tests/CMakeLists.txt | 3 +- cpp/tests/io/nested_json_test.cu | 360 ------------------ cpp/tests/io/{fst => }/type_inference_test.cu | 6 +- 4 files changed, 5 insertions(+), 529 deletions(-) delete mode 100644 cpp/src/io/json/nested_json.h delete mode 100644 cpp/tests/io/nested_json_test.cu rename cpp/tests/io/{fst => }/type_inference_test.cu (93%) diff --git a/cpp/src/io/json/nested_json.h b/cpp/src/io/json/nested_json.h deleted file mode 100644 index 69c6c63ea52..00000000000 --- a/cpp/src/io/json/nested_json.h +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#include - -namespace cudf { -namespace io { -namespace json { -namespace gpu { - -/// Type used to represent the atomic symbol type used within the finite-state machine -using SymbolT = char; - -/// Type used to represent the stack alphabet (i.e.: empty-stack, struct, list) -using StackSymbolT = char; - -/// Type used to index into the symbols within the JSON input -using SymbolOffsetT = uint32_t; - -/// Type large enough to support indexing up to max nesting level (must be signed) -using StackLevelT = int8_t; - -/// Type used to represent a symbol group id of the input alphabet in the pushdown automaton -using PdaInputSymbolGroupIdT = char; - -/// Type used to represent a symbol group id of the stack alphabet in the pushdown automaton -using PdaStackSymbolGroupIdT = char; - -/// Type used to represent a (input-symbol, stack-symbole)-tuple in stack-symbole-major order -using PdaSymbolGroupIdT = char; - -/// Type being emitted by the pushdown automaton transducer -using PdaTokenT = char; - -/// Type used to represent the class of a node (or a node "category") within the tree representation -using NodeT = char; - -/// Type used to index into the nodes within the tree of structs, lists, field names, and value -/// nodes -using NodeIndexT = uint32_t; - -/// Type large enough to represent tree depth from [0, max-tree-depth); may be an unsigned type -using TreeDepthT = StackLevelT; - -using tree_meta_t = std::tuple, - std::vector, - std::vector, - std::vector, - std::vector>; - -constexpr NodeIndexT parent_node_sentinel = std::numeric_limits::max(); - -/** - * @brief Tokens emitted while parsing a JSON input - */ -enum token_t : PdaTokenT { - /// Beginning-of-struct token (on encounter of semantic '{') - TK_BOS, - /// Beginning-of-list token (on encounter of semantic '[') - TK_BOL, - /// Beginning-of-error token (on first encounter of a parsing error) - TK_ERR, - /// Beginning-of-string-value token (on encounter of the string's first quote) - TK_BST, - /// Beginning-of-value token (first character of literal or numeric) - TK_BOV, - /// End-of-list token (on encounter of semantic ']') - TK_EOL, - /// End-of-struct token (on encounter of semantic '}') - TK_EOS, - /// Beginning-of-field-name token (on encounter of first quote) - TK_BFN, - /// Post-value token (first character after a literal or numeric string) - TK_POV, - /// End-of-string token (on encounter of a string's second quote) - TK_EST, - /// End-of-field-name token (on encounter of a field name's second quote) - TK_EFN, - /// Total number of tokens - NUM_TOKENS -}; - -/** - * @brief Class of a node (or a node "category") within the tree representation - */ -enum node_t : NodeT { - /// A node representing a struct - NC_STRUCT, - /// A node representing a list - NC_LIST, - /// A node representing a field name - NC_FN, - /// A node representing a string value - NC_STR, - /// A node representing a numeric or literal value (e.g., true, false, null) - NC_VAL, - /// A node representing a parser error - NC_ERR, - /// Total number of node classes - NUM_NODE_CLASSES -}; - -/** - * @brief Identifies the stack context for each character from a JSON input. Specifically, we - * identify brackets and braces outside of quoted fields (e.g., field names, strings). - * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing - * bracket would actually pop a the corresponding opening brace. 
- * - * @param d_json_in The string of input characters - * @param d_top_of_stack - * @param stream The cuda stream to dispatch GPU kernels to - */ -void get_stack_context(device_span d_json_in, - device_span d_top_of_stack, - rmm::cuda_stream_view stream); - -/** - * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant - * sections from the input. - * - * @param d_json_in The JSON input - * @param d_tokens_out Device memory to which the parsed tokens are written - * @param d_tokens_indices Device memory to which the indices are written, where each index - * represents the offset within \p d_json_in that cause the input being written - * @param d_num_written_tokens The total number of tokens that were parsed - * @param stream The CUDA stream to which kernels are dispatched - */ -void get_token_stream(device_span d_json_in, - device_span d_tokens, - device_span d_tokens_indices, - SymbolOffsetT* d_num_written_tokens, - rmm::cuda_stream_view stream); - -/** - * @briefTakes a JSON input in host memory and returns the tree representation of the JSON input. - * Specifically, the host-side JSON input is transferred to the GPU, where the JSON tokenizer is - * run. The token stream is then copied back to the CPU where the tree representation is computed. - * - * @param input The JSON input - * @param stream The CUDA stream to which kernels and memcpy'ies are dispatched - * @return Returns a tree representation of the JSON input on the host - */ -tree_meta_t get_tree_representation(host_span input, rmm::cuda_stream_view stream); - -} // namespace gpu -} // namespace json -} // namespace io -} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4091da6727b..4d40dc709ba 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -226,7 +226,8 @@ ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) -ConfigureTest(FST_TEST io/fst/fst_test.cu io/fst/type_inference_test.cu) +ConfigureTest(FST_TEST io/fst/fst_test.cu) +ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu deleted file mode 100644 index e11727f01c6..00000000000 --- a/cpp/tests/io/nested_json_test.cu +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include - -#include -#include - -namespace nested_json = cudf::io::json::gpu; - -// Base test fixture for tests -struct JsonTest : public cudf::test::BaseFixture { -}; - -TEST_F(JsonTest, StackContext) -{ - // Type used to represent the atomic symbol type used within the finite-state machine - using SymbolT = char; - using StackSymbolT = char; - - // Prepare cuda stream for data transfers & kernels - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - rmm::cuda_stream_view stream_view(stream); - - // Test input - std::string input = R"( [{)" - R"("category": "reference",)" - R"("index:": [4,12,42],)" - R"("author": "Nigel Rees",)" - R"("title": "[Sayings of the Century]",)" - R"("price": 8.95)" - R"(}, )" - R"({)" - R"("category": "reference",)" - R"("index": [4,{},null,{"a":[{ }, {}] } ],)" - R"("author": "Nigel Rees",)" - R"("title": "{}[], <=semantic-symbols-string",)" - R"("price": 8.95)" - R"(}] )"; - - // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); - hostdevice_vector stack_context(input.size(), stream_view); - - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( - d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); - - // Run algorithm - cudf::io::json::gpu::get_stack_context( - d_input, - cudf::device_span{stack_context.device_ptr(), stack_context.size()}, - stream); - - // Copy back the results - stack_context.device_to_host(stream); - - // Make sure we copied back the stack context - stream_view.synchronize(); - - std::vector golden_stack_context{ - '_', '_', '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '[', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', '[', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', - '{', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '[', '{', '{', '[', '[', - '[', '{', '[', '{', '{', '[', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', - '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '_'}; - - ASSERT_EQ(golden_stack_context.size(), stack_context.size()); - for (std::size_t i = 0; i < stack_context.size() && i < 1000; i++) { - ASSERT_EQ(golden_stack_context[i], stack_context[i]); - } -} - -TEST_F(JsonTest, TokenStream) -{ - using cudf::io::json::gpu::PdaTokenT; - using cudf::io::json::gpu::SymbolOffsetT; - using cudf::io::json::gpu::SymbolT; - - constexpr std::size_t single_item = 1; - - // Prepare cuda stream for data transfers & kernels - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - rmm::cuda_stream_view stream_view(stream); 
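  // Illustrative example: for a minimal input such as {"a":[1]} the tokenizer is
  // expected to emit TK_BOS, TK_BFN, TK_EFN, TK_BOL, TK_BOV, TK_POV, TK_EOL, TK_EOS,
  // each paired with the offset of the character that triggered it (TK_POV and TK_EOL
  // share the offset of ']' because the numeric value ends at that bracket).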
- - // Test input - std::string input = R"( [{)" - R"("category": "reference",)" - R"("index:": [4,12,42],)" - R"("author": "Nigel Rees",)" - R"("title": "[Sayings of the Century]",)" - R"("price": 8.95)" - R"(}, )" - R"({)" - R"("category": "reference",)" - R"("index": [4,{},null,{"a":[{ }, {}] } ],)" - R"("author": "Nigel Rees",)" - R"("title": "{}[], <=semantic-symbols-string",)" - R"("price": 8.95)" - R"(}] )"; - - // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); - - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( - d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); - - hostdevice_vector tokens_gpu{input.size(), stream}; - hostdevice_vector token_indices_gpu{input.size(), stream}; - hostdevice_vector num_tokens_out{single_item, stream}; - - // Parse the JSON and get the token stream - cudf::io::json::gpu::get_token_stream( - d_input, - cudf::device_span{tokens_gpu.device_ptr(), tokens_gpu.size()}, - cudf::device_span{token_indices_gpu.device_ptr(), token_indices_gpu.size()}, - num_tokens_out.device_ptr(), - stream); - - // Copy back the number of tokens that were written - num_tokens_out.device_to_host(stream); - tokens_gpu.device_to_host(stream); - token_indices_gpu.device_to_host(stream); - - // Make sure we copied back all relevant data - stream_view.synchronize(); - - // Golden token stream sample - std::vector> golden_token_stream = { - {2, nested_json::TK_BOL}, {3, nested_json::TK_BOS}, {4, nested_json::TK_BFN}, - {13, nested_json::TK_EFN}, {16, nested_json::TK_BST}, {26, nested_json::TK_EST}, - {28, nested_json::TK_BFN}, {35, nested_json::TK_EFN}, {38, nested_json::TK_BOL}, - {39, nested_json::TK_BOV}, {40, nested_json::TK_POV}, {41, nested_json::TK_BOV}, - {43, nested_json::TK_POV}, {44, nested_json::TK_BOV}, {46, nested_json::TK_POV}, - {46, nested_json::TK_EOL}, {48, nested_json::TK_BFN}, {55, nested_json::TK_EFN}, - {58, nested_json::TK_BST}, {69, nested_json::TK_EST}, {71, nested_json::TK_BFN}, - {77, nested_json::TK_EFN}, {80, nested_json::TK_BST}, {105, nested_json::TK_EST}, - {107, nested_json::TK_BFN}, {113, nested_json::TK_EFN}, {116, nested_json::TK_BOV}, - {120, nested_json::TK_POV}, {120, nested_json::TK_EOS}, {124, nested_json::TK_BOS}, - {125, nested_json::TK_BFN}, {134, nested_json::TK_EFN}, {137, nested_json::TK_BST}, - {147, nested_json::TK_EST}, {149, nested_json::TK_BFN}, {155, nested_json::TK_EFN}, - {158, nested_json::TK_BOL}, {159, nested_json::TK_BOV}, {160, nested_json::TK_POV}, - {161, nested_json::TK_BOS}, {162, nested_json::TK_EOS}, {164, nested_json::TK_BOV}, - {168, nested_json::TK_POV}, {169, nested_json::TK_BOS}, {170, nested_json::TK_BFN}, - {172, nested_json::TK_EFN}, {174, nested_json::TK_BOL}, {175, nested_json::TK_BOS}, - {177, nested_json::TK_EOS}, {180, nested_json::TK_BOS}, {181, nested_json::TK_EOS}, - {182, nested_json::TK_EOL}, {184, nested_json::TK_EOS}, {186, nested_json::TK_EOL}, - {188, nested_json::TK_BFN}, {195, nested_json::TK_EFN}, {198, nested_json::TK_BST}, - {209, nested_json::TK_EST}, {211, nested_json::TK_BFN}, {217, nested_json::TK_EFN}, - {220, nested_json::TK_BST}, {252, nested_json::TK_EST}, {254, nested_json::TK_BFN}, - {260, nested_json::TK_EFN}, {263, nested_json::TK_BOV}, {267, nested_json::TK_POV}, - {267, nested_json::TK_EOS}, {268, nested_json::TK_EOL}}; - - // Verify the number of tokens matches - ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); - - for (std::size_t i = 0; i < num_tokens_out[0]; i++) { - // Ensure the index 
the tokens are pointing to do match - ASSERT_EQ(golden_token_stream[i].first, token_indices_gpu[i]); - // Ensure the token category is correct - ASSERT_EQ(golden_token_stream[i].second, tokens_gpu[i]); - } -} - -std::string get_node_string(std::size_t const node_id, - nested_json::tree_meta_t const& tree_rep, - std::string const& json_input) -{ - auto const& node_categories = std::get<0>(tree_rep); - auto const& parent_node_ids = std::get<1>(tree_rep); - auto const& node_levels = std::get<2>(tree_rep); - auto const& node_range_begin = std::get<3>(tree_rep); - auto const& node_range_end = std::get<4>(tree_rep); - - auto node_to_str = [] __host__ __device__(nested_json::PdaTokenT const token) { - switch (token) { - case nested_json::NC_STRUCT: return "STRUCT"; - case nested_json::NC_LIST: return "LIST"; - case nested_json::NC_FN: return "FN"; - case nested_json::NC_STR: return "STR"; - case nested_json::NC_VAL: return "VAL"; - case nested_json::NC_ERR: return "ERR"; - default: return "N/A"; - }; - }; - - return "<" + std::to_string(node_id) + ":" + node_to_str(node_categories[node_id]) + ":[" + - std::to_string(node_range_begin[node_id]) + ", " + - std::to_string(node_range_end[node_id]) + ") '" + - json_input.substr(node_range_begin[node_id], - node_range_end[node_id] - node_range_begin[node_id]) + - "'>"; -} - -void print_tree_representation(std::string const& json_input, - nested_json::tree_meta_t const& tree_rep) -{ - for (std::size_t i = 0; i < std::get<0>(tree_rep).size(); i++) { - auto const& parent_node_ids = std::get<1>(tree_rep); - std::size_t parent_id = parent_node_ids[i]; - std::stack path; - path.push(i); - while (parent_id != nested_json::parent_node_sentinel) { - path.push(parent_id); - parent_id = parent_node_ids[parent_id]; - } - - while (path.size()) { - auto const node_id = path.top(); - std::cout << get_node_string(node_id, tree_rep, json_input) - << (path.size() > 1 ? 
" -> " : ""); - path.pop(); - } - std::cout << "\n"; - } -} - -TEST_F(JsonTest, TreeRepresentation) -{ - using nested_json::PdaTokenT; - using nested_json::SymbolOffsetT; - using nested_json::SymbolT; - - // Prepare cuda stream for data transfers & kernels - cudaStream_t stream = nullptr; - cudaStreamCreate(&stream); - rmm::cuda_stream_view stream_view(stream); - - // Test input - std::string input = R"( [{)" - R"("category": "reference",)" - R"("index:": [4,12,42],)" - R"("author": "Nigel Rees",)" - R"("title": "[Sayings of the Century]",)" - R"("price": 8.95)" - R"(}, )" - R"({)" - R"("category": "reference",)" - R"("index": [4,{},null,{"a":[{ }, {}] } ],)" - R"("author": "Nigel Rees",)" - R"("title": "{}[], <=semantic-symbols-string",)" - R"("price": 8.95)" - R"(}] )"; - - // Get the JSON's tree representation - auto tree_rep = nested_json::get_tree_representation( - cudf::host_span{input.data(), input.size()}, stream_view); - - auto const& node_categories = std::get<0>(tree_rep); - auto const& parent_node_ids = std::get<1>(tree_rep); - auto const& node_levels = std::get<2>(tree_rep); - auto const& node_range_begin = std::get<3>(tree_rep); - auto const& node_range_end = std::get<4>(tree_rep); - - // Golden sample of node categories - std::vector golden_node_categories = { - nested_json::NC_LIST, nested_json::NC_STRUCT, nested_json::NC_FN, nested_json::NC_STR, - nested_json::NC_FN, nested_json::NC_LIST, nested_json::NC_VAL, nested_json::NC_VAL, - nested_json::NC_VAL, nested_json::NC_FN, nested_json::NC_STR, nested_json::NC_FN, - nested_json::NC_STR, nested_json::NC_FN, nested_json::NC_VAL, nested_json::NC_STRUCT, - nested_json::NC_FN, nested_json::NC_STR, nested_json::NC_FN, nested_json::NC_LIST, - nested_json::NC_VAL, nested_json::NC_STRUCT, nested_json::NC_VAL, nested_json::NC_STRUCT, - nested_json::NC_FN, nested_json::NC_LIST, nested_json::NC_STRUCT, nested_json::NC_STRUCT, - nested_json::NC_FN, nested_json::NC_STR, nested_json::NC_FN, nested_json::NC_STR, - nested_json::NC_FN, nested_json::NC_VAL}; - - // Golden sample of node ids - std::vector golden_parent_node_ids = {nested_json::parent_node_sentinel, - 0, - 1, - 2, - 1, - 4, - 5, - 5, - 5, - 1, - 9, - 1, - 11, - 1, - 13, - 0, - 15, - 16, - 15, - 18, - 19, - 19, - 19, - 19, - 23, - 24, - 25, - 25, - 15, - 28, - 15, - 30, - 15, - 32}; - - // Golden sample of node levels - std::vector golden_node_levels = {0, 1, 2, 3, 2, 3, 4, 4, 4, 2, 3, 2, - 3, 2, 3, 1, 2, 3, 2, 3, 4, 4, 4, 4, - 5, 6, 7, 7, 2, 3, 2, 3, 2, 3}; - - // Golden sample of the character-ranges from the original input that each node demarcates - std::vector golden_node_range_begin = { - 2, 3, 5, 17, 29, 38, 39, 41, 44, 49, 59, 72, 81, 108, 116, 124, 126, - 138, 150, 158, 159, 161, 164, 169, 171, 174, 175, 180, 189, 199, 212, 221, 255, 263}; - - // Golden sample of the character-ranges from the original input that each node demarcates - std::vector golden_node_range_end = { - 3, 4, 13, 26, 35, 39, 40, 43, 46, 55, 69, 77, 105, 113, 120, 125, 134, - 147, 155, 159, 160, 162, 168, 170, 172, 175, 176, 181, 195, 209, 217, 252, 260, 267}; - - // Check results against golden samples - ASSERT_EQ(golden_node_categories.size(), node_categories.size()); - ASSERT_EQ(golden_parent_node_ids.size(), parent_node_ids.size()); - ASSERT_EQ(golden_node_levels.size(), node_levels.size()); - ASSERT_EQ(golden_node_range_begin.size(), node_range_begin.size()); - ASSERT_EQ(golden_node_range_end.size(), node_range_end.size()); - - for (std::size_t i = 0; i < golden_node_categories.size(); i++) { - 
ASSERT_EQ(golden_node_categories[i], node_categories[i]); - ASSERT_EQ(golden_parent_node_ids[i], parent_node_ids[i]); - ASSERT_EQ(golden_node_levels[i], node_levels[i]); - ASSERT_EQ(golden_node_range_begin[i], node_range_begin[i]); - ASSERT_EQ(golden_node_range_end[i], node_range_end[i]); - } -} diff --git a/cpp/tests/io/fst/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu similarity index 93% rename from cpp/tests/io/fst/type_inference_test.cu rename to cpp/tests/io/type_inference_test.cu index 16ac0fd5211..17925c33b9f 100644 --- a/cpp/tests/io/fst/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include #include #include +#include #include @@ -27,8 +27,8 @@ #include #include -using cudf::io::fst::detail::detect_data_type; -using cudf::io::fst::detail::inference_options; +using cudf::io::detail::detect_data_type; +using cudf::io::detail::inference_options; // Base test fixture for tests struct TypeInference : public cudf::test::BaseFixture { From 6409a5f0037a2606b726ea342b7896c3f95055ce Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Aug 2022 10:49:23 -0400 Subject: [PATCH 071/173] Get rid of narrow conversion + add string handling --- cpp/src/io/utilities/type_inference.cuh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index dc615cba4ca..e2baec4d841 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -40,18 +40,21 @@ struct inference_options_view { cudf::detail::trie_view trie_true; cudf::detail::trie_view trie_false; cudf::detail::trie_view trie_na; + char quote_char; }; struct inference_options { cudf::detail::optional_trie trie_true; cudf::detail::optional_trie trie_false; cudf::detail::optional_trie trie_na; + char quote_char; [[nodiscard]] inference_options_view view() const { return {cudf::detail::make_trie_view(trie_true), cudf::detail::make_trie_view(trie_false), - cudf::detail::make_trie_view(trie_na)}; + cudf::detail::make_trie_view(trie_na), + quote_char}; } }; @@ -114,11 +117,19 @@ __global__ void detect_column_type_kernel(inference_options_view const options, while (idx < size) { auto const [field_offset, field_len] = *(column_strings_begin + idx); auto const field_begin = data.begin() + field_offset; - if (cudf::detail::serialized_trie_contains(options.trie_na, {field_begin, field_len})) { + if (cudf::detail::serialized_trie_contains( + options.trie_na, {field_begin, static_cast(field_len)})) { atomicAdd(&column_info->null_count, 1); continue; } + // Handling strings + if (field_len == 0) continue; + if (*field_begin == options.quote_char && field_begin[field_len - 1] == options.quote_char) { + atomicAdd(&column_info->string_count, 1); + continue; + } + // No need to check strings since it's inferred in the tree generation int digit_count = 0; int decimal_count = 0; @@ -160,8 +171,10 @@ __global__ void detect_column_type_kernel(inference_options_view const options, if ((*field_begin == '-' || *field_begin == '+') && field_len > 1) { --int_req_number_cnt; } // Off by one if they are a hexadecimal number if (maybe_hex) { --int_req_number_cnt; } - if (cudf::detail::serialized_trie_contains(options.trie_true, {field_begin, field_len}) || - cudf::detail::serialized_trie_contains(options.trie_false, {field_begin, field_len})) { + if (cudf::detail::serialized_trie_contains( + options.trie_true, {field_begin, 
static_cast(field_len)}) || + cudf::detail::serialized_trie_contains( + options.trie_false, {field_begin, static_cast(field_len)})) { atomicAdd(&column_info->bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*field_begin == '-'); @@ -224,7 +237,7 @@ cudf::data_type detect_data_type(inference_options_view const& options, // Entire column is NULL; allocate the smallest amount of memory return type_id::INT8; } else if (cinfo.string_count > 0) { - CUDF_FAIL("Unexpected string type in type inference."); + return type_id::STRING; } else if (cinfo.datetime_count > 0) { return type_id::TIMESTAMP_MILLISECONDS; } else if (cinfo.float_count > 0 || (int_count_total > 0 && cinfo.null_count > 0)) { From ec07bcacd07b0d8d5e1744c6cfc01304b4d6eb09 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Aug 2022 11:27:17 -0400 Subject: [PATCH 072/173] Updates: make column string iter compatible with zip iterator --- cpp/src/io/utilities/type_inference.cuh | 7 +++++-- cpp/tests/io/type_inference_test.cu | 15 ++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index e2baec4d841..bf7058c7435 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -27,6 +27,7 @@ #include #include +#include #include @@ -115,8 +116,10 @@ __global__ void detect_column_type_kernel(inference_options_view const options, auto idx = threadIdx.x + blockDim.x * blockIdx.x; while (idx < size) { - auto const [field_offset, field_len] = *(column_strings_begin + idx); - auto const field_begin = data.begin() + field_offset; + // auto const [field_offset, field_len] = *(column_strings_begin + idx); + auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); + auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); + auto const field_begin = data.begin() + field_offset; if (cudf::detail::serialized_trie_contains( options.trie_na, {field_begin, static_cast(field_len)})) { atomicAdd(&column_info->null_count, 1); diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 17925c33b9f..673e389b4ad 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -23,6 +23,8 @@ #include #include +#include + #include #include #include @@ -49,12 +51,15 @@ TEST_F(TypeInference, Basic) d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); std::size_t constexpr size = 3; - rmm::device_uvector> d_col_strings{size, stream}; - d_col_strings.set_element(0, {1, 2}, stream); - d_col_strings.set_element(1, {4, 2}, stream); - d_col_strings.set_element(2, {7, 1}, stream); + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = detect_data_type(options.view(), d_data, d_col_strings.begin(), size, stream); + auto res_type = detect_data_type(options.view(), d_data, d_col_strings, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); } From 640eb0059af566e60297487754c8ac4d46a3fe19 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Aug 2022 11:30:06 -0400 Subject: [PATCH 073/173] Minor updates --- cpp/src/io/utilities/type_inference.cuh | 3 
+-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index bf7058c7435..dc220f5cf96 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -48,7 +48,7 @@ struct inference_options { cudf::detail::optional_trie trie_true; cudf::detail::optional_trie trie_false; cudf::detail::optional_trie trie_na; - char quote_char; + char quote_char = '"'; [[nodiscard]] inference_options_view view() const { @@ -116,7 +116,6 @@ __global__ void detect_column_type_kernel(inference_options_view const options, auto idx = threadIdx.x + blockDim.x * blockIdx.x; while (idx < size) { - // auto const [field_offset, field_len] = *(column_strings_begin + idx); auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); auto const field_begin = data.begin() + field_offset; From b0fac83ee114dcf8031c09b419c031b68d2b9092 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Aug 2022 12:11:52 -0400 Subject: [PATCH 074/173] Add missing header --- cpp/tests/io/type_inference_test.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 673e389b4ad..3afe0556da6 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -23,6 +23,7 @@ #include #include +#include #include #include From 67fcaf5c371df979357823528032c99fb000e529 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 19 Aug 2022 12:23:28 -0400 Subject: [PATCH 075/173] Fix the infinite loop bug with while --- cpp/src/io/utilities/type_inference.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index dc220f5cf96..5beec822566 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -113,12 +113,12 @@ __global__ void detect_column_type_kernel(inference_options_view const options, std::size_t const size, cudf::io::column_type_histogram* column_info) { - auto idx = threadIdx.x + blockDim.x * blockIdx.x; - - while (idx < size) { + for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; + idx += gridDim.x + blockDim.x) { auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); auto const field_begin = data.begin() + field_offset; + if (cudf::detail::serialized_trie_contains( options.trie_na, {field_begin, static_cast(field_len)})) { atomicAdd(&column_info->null_count, 1); @@ -196,9 +196,7 @@ __global__ void detect_column_type_kernel(inference_options_view const options, colon_count <= 2) { atomicAdd(&column_info->datetime_count, 1); } - - idx += gridDim.x + blockDim.x; - } // while + } // for } template From 51997be820196f8e1e7cbb740404c4f6a2cee36a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 20 Aug 2022 01:35:50 -0400 Subject: [PATCH 076/173] Update cpp/src/io/utilities/type_inference.cuh Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/src/io/utilities/type_inference.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 5beec822566..39932836edd 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ 
-114,7 +114,7 @@ __global__ void detect_column_type_kernel(inference_options_view const options, cudf::io::column_type_histogram* column_info) { for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; - idx += gridDim.x + blockDim.x) { + idx += gridDim.x * blockDim.x) { auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); auto const field_len = thrust::get<1>(*(column_strings_begin + idx)); auto const field_begin = data.begin() + field_offset; From a5e50d68e5bc7d7d62500c7ca377ddab80dec9e8 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 00:32:11 -0700 Subject: [PATCH 077/173] patches data casting for escape handling --- cpp/src/io/json/data_casting.cuh | 313 +++++++++++++++++++++++++++---- 1 file changed, 276 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 437b3093af7..6a74f38de66 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,52 @@ namespace cudf::io::json::experimental { +constexpr char UNICODE_SEQ = 0x7F; +constexpr char NON_ESCAPE_CHAR = 0x7E; +__device__ __forceinline__ char get_escape_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return 0x22; + case '\\': return 0x5C; + case '/': return 0x2F; + case 'b': return 0x08; + case 'f': return 0x0C; + case 'n': return 0x0A; + case 'r': return 0x0D; + case 't': return 0x09; + case 'u': return UNICODE_SEQ; + default: return NON_ESCAPE_CHAR; + } +} + +__device__ __forceinline__ int64_t string_to_hex(char const* str) +{ + // Unicode code point escape sequence comprises four hex characters + constexpr size_type unicode_hex_digits = 4; + + // Prepare result + int64_t result = 0, base = 1; + + // Iterate over hex digits right-to-left + size_type index = unicode_hex_digits; + while (index-- > 0) { + char const ch = str[index]; + if (ch >= '0' && ch <= '9') { + result += static_cast((ch - '0') + 0) * base; + base *= 16; + } else if (ch >= 'A' && ch <= 'F') { + result += static_cast((ch - 'A') + 10) * base; + base *= 16; + } else if (ch >= 'a' && ch <= 'f') { + result += static_cast((ch - 'a') + 10) * base; + base *= 16; + } else { + return -1; + } + } + return result; +} + template std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, @@ -39,48 +86,240 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, { if (col_type == cudf::data_type{cudf::type_id::STRING}) { rmm::device_uvector offsets(col_size + 1, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - sizes = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { - sizes[row] = 0; - return; - } - auto const in = str_tuples[row]; - - auto const is_null_literal = serialized_trie_contains( - options.trie_na, {in.first, static_cast(in.second)}); - if (is_null_literal) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - - sizes[row] = in.second; - }); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + sizes = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { + sizes[row] = 0; + return; + } + auto const in = str_tuples[row]; + + auto const 
is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + if (is_null_literal) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. {"a":1.23}) + bool const is_string_value = + in.second >= 2 && (*in.first == '"') && (in.first[in.second - 1] == '"'); + + decltype(in.second) out_size = in.second; + + if (is_string_value) { + // Strip off quote chars + out_size = 0; + + // Check + bool escape = false; + + // Exclude quote chars from string range + auto end_index = in.second - 1; + for (decltype(in.second) i = 1; i < end_index; ++i) { + // Previous char was escape char + if (escape) { + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(in.first[i]); + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + if (escaped_char == UNICODE_SEQ) { + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (i + 4 < end_index) { + auto hex_val = string_to_hex(&in.first[i + 1]); + if (hex_val < 0) { + // TODO signal parsing error: not all 4 hex digits + // printf("PROBLEMa!\n"); + continue; + } + // Skip over the four hex digits + i += 4; + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + if (i + 6 < end_index && in.first[i + 1] == '\\' && in.first[i + 2] == 'u') { + auto hex_low_val = string_to_hex(&in.first[i + 3]); + if (hex_val < 0xD800 && hex_low_val < 0xDC00) { + // TODO signal parsing error: not all 4 hex digits + // printf("PROBLEMb!\n"); + continue; + } + // Skip over the second \uXXXX sequence + i += 6; + uint32_t unicode_code_point = + 0x10000 + (hex_val - 0xD800) + (hex_low_val - 0xDC00); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + // Just a single \uXXXX sequence + else { + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + } else { + // TODO signal parsing error: expected 4 hex digits + // printf("PROBLEMc!\n"); + } + } else if (escaped_char == NON_ESCAPE_CHAR) { + // TODO signal parsing error: this char does not need to be escape + // printf("PROBLEMd!\n"); + } else { + out_size++; + } + } else { + escape = in.first[i] == '\\'; + out_size += escape ? 
0 : 1; + } + } + if (escape) { + // TODO signal parsing error: last char was escape, not followed by + // anything to escape + // printf("PROBLEMe!\n"); + } + } + + // Strip quotes if asked to do so + sizes[row] = out_size; + }); thrust::exclusive_scan( rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); rmm::device_uvector chars(offsets.back_element(stream), stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - chars = device_span{chars}, - offsets = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { return; } - auto const in = str_tuples[row]; - for (int i = 0; i < in.second; ++i) { - chars[offsets[row] + i] = *(in.first + i); - } - }); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + chars = device_span{chars}, + offsets = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { return; } + auto const in = str_tuples[row]; + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. {"a":1.23}) + bool const is_string_value = + in.second >= 2 && (*in.first == '"') && (in.first[in.second - 1] == '"'); + + // Copy string value with quote char and escape sequence handling + if (is_string_value) { + decltype(in.second) start_index = (is_string_value ? 1 : 0); + decltype(in.second) end_index = (is_string_value ? in.second - 1 : in.second); + // Check + bool escape = false; + for (int i = start_index, j = 0; i < end_index; ++i) { + // Previous char was escape char + if (escape) { + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(in.first[i]); + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + if (escaped_char == UNICODE_SEQ) { + // printf("UNICODE!\n"); + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (i + 4 < end_index) { + auto hex_val = string_to_hex(&in.first[i + 1]); + // printf("HEX 0x%08X '%c%c%c%c'!\n", + // static_cast(hex_val), + // in.first[i + 1 + 0], + // in.first[i + 1 + 1], + // in.first[i + 1 + 2], + // in.first[i + 1 + 3]); + if (hex_val < 0) { + // TODO signal parsing error: not all 4 hex digits + // printf("PROBLEM1 %lld '%c%c%c%c'!\n", + // hex_val, + // in.first[i + 1 + 0], + // in.first[i + 1 + 1], + // in.first[i + 1 + 2], + // in.first[i + 1 + 3]); + continue; + } + // Skip over the four hex digits + i += 4; + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + if (i + 6 < end_index && in.first[i + 1] == '\\' && in.first[i + 2] == 'u') { + // printf("UNICODE UTF16!\n"); + auto hex_low_val = string_to_hex(&in.first[i + 3]); + // printf("HEX 0x%08X '%c%c%c%c'!\n", + // static_cast(hex_low_val), + // in.first[i + 3 + 0], + // in.first[i + 3 + 1], + // in.first[i + 3 + 2], + // in.first[i + 3 + 3]); + if (hex_val < 0xD800 && hex_low_val < 0xDC00) { + // TODO signal parsing error: not all 4 hex digits + // printf("PROBLEM2!\n"); + continue; + } + // Skip over the second \uXXXX sequence + i += 6; + uint32_t unicode_code_point = + 0x10000 + ((hex_val - 0xD800) << 10) + (hex_low_val - 0xDC00); + // printf("0x%08X\n", 
unicode_code_point); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); + } + // Just a single \uXXXX sequence + else { + // printf("0x%08X\n", static_cast(hex_val)); + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); + } + } else { + // TODO signal parsing error: expected 4 hex digits + // printf("PROBLEM3!\n"); + } + } else if (escaped_char == NON_ESCAPE_CHAR) { + // TODO signal parsing error: this char does not need to be escape + // printf("PROBLE4M!\n"); + } else { + chars[offsets[row] + j] = escaped_char; + j++; + } + } else { + escape = in.first[i] == '\\'; + if (!escape) { + chars[offsets[row] + j] = *(in.first + i); + j++; + } + } + } + if (escape) { + // printf("PROBLEM5!\n"); + // TODO signal parsing error: last char was escape, not followed by + // anything to escape + } + } + // Copy literal/numeric value + else { + for (int i = 0, j = 0; i < in.second; ++i) { + chars[offsets[row] + j] = *(in.first + i); + j++; + } + } + }); return make_strings_column( col_size, std::move(offsets), std::move(chars), std::move(null_mask)); From fe70ac2b514569e340b0766354a8563c69622935 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 00:45:35 -0700 Subject: [PATCH 078/173] resolves downstream inference conflicts --- cpp/src/io/json/nested_json_gpu.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 17f44a980d5..77e7f287226 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1421,7 +1421,7 @@ auto default_json_options() auto default_inference_options() { - fst::detail::inference_options parse_opts{}; + cudf::io::detail::inference_options parse_opts{}; auto const stream = rmm::cuda_stream_default; parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); @@ -1477,7 +1477,7 @@ std::pair, std::vector> json_column_to }); // Infer column type - auto target_type = fst::detail::detect_data_type( + auto target_type = cudf::io::detail::detect_data_type( default_inference_options().view(), d_input, string_ranges_it, col_size, stream); // Convert strings to the inferred data type From 779b638c0cb24d7e6eaed9a95fdc07e5fb615537 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 14:04:37 -0700 Subject: [PATCH 079/173] removes debug prints from casting --- cpp/src/io/json/data_casting.cuh | 33 +------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 6a74f38de66..cbdc31a1933 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -122,7 +122,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // Check bool escape = false; - // Exclude quote chars from string range + // Exclude beginning and ending quote chars from string range auto end_index = in.second - 1; for (decltype(in.second) i = 1; i < end_index; ++i) { // Previous char was escape char @@ -142,7 +142,6 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto hex_val = string_to_hex(&in.first[i + 1]); if (hex_val < 0) { // TODO signal parsing error: not all 4 hex digits - // printf("PROBLEMa!\n"); continue; } // Skip over the four hex 
digits @@ -154,7 +153,6 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto hex_low_val = string_to_hex(&in.first[i + 3]); if (hex_val < 0xD800 && hex_low_val < 0xDC00) { // TODO signal parsing error: not all 4 hex digits - // printf("PROBLEMb!\n"); continue; } // Skip over the second \uXXXX sequence @@ -171,11 +169,9 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, } } else { // TODO signal parsing error: expected 4 hex digits - // printf("PROBLEMc!\n"); } } else if (escaped_char == NON_ESCAPE_CHAR) { // TODO signal parsing error: this char does not need to be escape - // printf("PROBLEMd!\n"); } else { out_size++; } @@ -187,7 +183,6 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, if (escape) { // TODO signal parsing error: last char was escape, not followed by // anything to escape - // printf("PROBLEMe!\n"); } } @@ -233,25 +228,12 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // This is an escape sequence of a unicode code point: \uXXXX, // where each X in XXXX represents a hex digit if (escaped_char == UNICODE_SEQ) { - // printf("UNICODE!\n"); // Make sure that there's at least 4 characters left from the // input, which are expected to be hex digits if (i + 4 < end_index) { auto hex_val = string_to_hex(&in.first[i + 1]); - // printf("HEX 0x%08X '%c%c%c%c'!\n", - // static_cast(hex_val), - // in.first[i + 1 + 0], - // in.first[i + 1 + 1], - // in.first[i + 1 + 2], - // in.first[i + 1 + 3]); if (hex_val < 0) { // TODO signal parsing error: not all 4 hex digits - // printf("PROBLEM1 %lld '%c%c%c%c'!\n", - // hex_val, - // in.first[i + 1 + 0], - // in.first[i + 1 + 1], - // in.first[i + 1 + 2], - // in.first[i + 1 + 3]); continue; } // Skip over the four hex digits @@ -260,40 +242,28 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // If this may be a UTF-16 encoded surrogate pair: // we expect another \uXXXX sequence if (i + 6 < end_index && in.first[i + 1] == '\\' && in.first[i + 2] == 'u') { - // printf("UNICODE UTF16!\n"); auto hex_low_val = string_to_hex(&in.first[i + 3]); - // printf("HEX 0x%08X '%c%c%c%c'!\n", - // static_cast(hex_low_val), - // in.first[i + 3 + 0], - // in.first[i + 3 + 1], - // in.first[i + 3 + 2], - // in.first[i + 3 + 3]); if (hex_val < 0xD800 && hex_low_val < 0xDC00) { // TODO signal parsing error: not all 4 hex digits - // printf("PROBLEM2!\n"); continue; } // Skip over the second \uXXXX sequence i += 6; uint32_t unicode_code_point = 0x10000 + ((hex_val - 0xD800) << 10) + (hex_low_val - 0xDC00); - // printf("0x%08X\n", unicode_code_point); auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); } // Just a single \uXXXX sequence else { - // printf("0x%08X\n", static_cast(hex_val)); auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); } } else { // TODO signal parsing error: expected 4 hex digits - // printf("PROBLEM3!\n"); } } else if (escaped_char == NON_ESCAPE_CHAR) { // TODO signal parsing error: this char does not need to be escape - // printf("PROBLE4M!\n"); } else { chars[offsets[row] + j] = escaped_char; j++; @@ -307,7 +277,6 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, } } if (escape) { - // printf("PROBLEM5!\n"); // TODO signal parsing error: last char was escape, not followed by // anything to escape } From 05b506f09de7cc16f06390cb2e8e497dfc18a9d3 Mon Sep 17 00:00:00 2001 From: Elias Stehle 
<3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 14:16:43 -0700 Subject: [PATCH 080/173] removes local test --- cpp/tests/io/json_test.cpp | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 4afa9c094c4..154b25bf7b9 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -969,27 +969,4 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); } -TEST_F(JsonReaderTest, JsonExperimentalLarge) -{ - // std::string json_path = "/raid/estehle/rapids/cudf/large.json"; - std::string json_path = "/raid/estehle/rapids/cudf/large32x.json"; - - // Initialize parsing options (reading json lines) - cudf::io::json_reader_options json_lines_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{json_path}).lines(true); - - // Read test data via existing, non-nested json lines reader - cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); - - // Read test data via new, nested json reader - json_lines_options.enable_experimental(true); - cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); - - // Verify that the data read via parquet matches the data read via JSON - CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); - - // TODO enable once existing JSON lines reader's schema generation has been adjusted - // cudf::test::expect_metadata_equal(current_reader_table.metadata, new_reader_table.metadata); -} - CUDF_TEST_PROGRAM_MAIN() From bcf4b86e52c7bf8ab3ae0e3f1cea8875378070b1 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 23 Aug 2022 01:06:07 -0700 Subject: [PATCH 081/173] adds new logic for inferring nested columns --- cpp/src/io/json/nested_json.hpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index cf8971ee588..b329a66b131 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -200,7 +200,21 @@ struct json_column { uint32_t child_count) { // If, thus far, the column's type couldn't be inferred, we infer it to the given type - if (type == json_col_t::Unknown) { type = row_type; } + if (type == json_col_t::Unknown) { + type = row_type; + } + // If, at some point within a column, we encounter a nested type (list or struct), + // we change that columns type to that respective nested type and invalidate all previous rows + else if (type == json_col_t::StringColumn && + (row_type == json_col_t::ListColumn || row_type == json_col_t::StructColumn)) { + // Change the column type + type = row_type; + + // Invalidate all previous entries, as they were _not_ of the nested type to which we just + // converted + std::fill_n(validity.begin(), validity.size(), 0); + valid_count = 0U; + } // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); @@ -217,13 +231,13 @@ struct json_column { // Struct | List => FAIL // Struct | Struct => valid // Struct | String => null - // String | List => null - // String | Struct => null + // String | List => valid (we switch col type to list, null'ing all previous rows) + // String | Struct => valid (we switch col type to list, null'ing all 
previous rows) // String | String => valid bool const is_valid = (type == row_type); if (static_cast(validity.size()) < word_index(current_offset)) validity.push_back({}); - set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); + if (is_valid) { set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); } valid_count += (is_valid) ? 1U : 0U; string_offsets.push_back(string_offset); string_lengths.push_back(string_end - string_offset); From ff87f3bc8b8fdf579cb8df2c10ffe45aab4bf88b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 23 Aug 2022 01:06:39 -0700 Subject: [PATCH 082/173] fixes issue for two subsequent non-UTF-16 unicode esc sequences --- cpp/src/io/json/data_casting.cuh | 212 ++++++++++++++++--------------- 1 file changed, 107 insertions(+), 105 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index cbdc31a1933..4594942a22d 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -86,109 +86,110 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, { if (col_type == cudf::data_type{cudf::type_id::STRING}) { rmm::device_uvector offsets(col_size + 1, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - sizes = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { - sizes[row] = 0; - return; - } - auto const in = str_tuples[row]; - - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); - if (is_null_literal) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. 
{"a":1.23}) - bool const is_string_value = - in.second >= 2 && (*in.first == '"') && (in.first[in.second - 1] == '"'); - - decltype(in.second) out_size = in.second; - - if (is_string_value) { - // Strip off quote chars - out_size = 0; - - // Check - bool escape = false; - - // Exclude beginning and ending quote chars from string range - auto end_index = in.second - 1; - for (decltype(in.second) i = 1; i < end_index; ++i) { - // Previous char was escape char - if (escape) { - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(in.first[i]); - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - if (escaped_char == UNICODE_SEQ) { - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (i + 4 < end_index) { - auto hex_val = string_to_hex(&in.first[i + 1]); - if (hex_val < 0) { - // TODO signal parsing error: not all 4 hex digits - continue; - } - // Skip over the four hex digits - i += 4; - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - if (i + 6 < end_index && in.first[i + 1] == '\\' && in.first[i + 2] == 'u') { - auto hex_low_val = string_to_hex(&in.first[i + 3]); - if (hex_val < 0xD800 && hex_low_val < 0xDC00) { - // TODO signal parsing error: not all 4 hex digits - continue; - } - // Skip over the second \uXXXX sequence - i += 6; - uint32_t unicode_code_point = - 0x10000 + (hex_val - 0xD800) + (hex_low_val - 0xDC00); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - out_size += strings::detail::bytes_in_char_utf8(utf8_chars); - } - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - out_size += strings::detail::bytes_in_char_utf8(utf8_chars); - } - } else { - // TODO signal parsing error: expected 4 hex digits - } - } else if (escaped_char == NON_ESCAPE_CHAR) { - // TODO signal parsing error: this char does not need to be escape - } else { - out_size++; - } - } else { - escape = in.first[i] == '\\'; - out_size += escape ? 0 : 1; - } - } - if (escape) { - // TODO signal parsing error: last char was escape, not followed by - // anything to escape - } - } - - // Strip quotes if asked to do so - sizes[row] = out_size; - }); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + sizes = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { + sizes[row] = 0; + return; + } + auto const in = str_tuples[row]; + + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in.first, static_cast(in.second)}); + if (is_null_literal) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. 
{"a":1.23}) + bool const is_string_value = + in.second >= 2 && (*in.first == '"') && (in.first[in.second - 1] == '"'); + + decltype(in.second) out_size = in.second; + + if (is_string_value) { + // Strip off quote chars + out_size = 0; + + // Check + bool escape = false; + + // Exclude beginning and ending quote chars from string range + auto end_index = in.second - 1; + for (decltype(in.second) i = 1; i < end_index; ++i) { + // Previous char was escape char + if (escape) { + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(in.first[i]); + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + if (escaped_char == UNICODE_SEQ) { + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (i + 4 < end_index) { + auto hex_val = string_to_hex(&in.first[i + 1]); + if (hex_val < 0) { + // TODO signal parsing error: not all 4 hex digits + continue; + } + // Skip over the four hex digits + i += 4; + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + if (i + 6 < end_index && in.first[i + 1] == '\\' && + in.first[i + 2] == 'u' && hex_val >= 0xD800) { + auto hex_low_val = string_to_hex(&in.first[i + 3]); + if (hex_val < 0xD800 || hex_low_val < 0xDC00) { + // TODO signal parsing error: not all 4 hex digits + continue; + } + // Skip over the second \uXXXX sequence + i += 6; + uint32_t unicode_code_point = + 0x10000 + (hex_val - 0xD800) + (hex_low_val - 0xDC00); + auto utf8_chars = + strings::detail::codepoint_to_utf8(unicode_code_point); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + // Just a single \uXXXX sequence + else { + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + } else { + // TODO signal parsing error: expected 4 hex digits + } + } else if (escaped_char == NON_ESCAPE_CHAR) { + // TODO signal parsing error: this char does not need to be escape + } else { + out_size++; + } + } else { + escape = in.first[i] == '\\'; + out_size += escape ? 
0 : 1; + } + } + if (escape) { + // TODO signal parsing error: last char was escape, not followed by + // anything to escape + } + } + + // Strip quotes if asked to do so + sizes[row] = out_size; + }); thrust::exclusive_scan( rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); @@ -241,9 +242,10 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // If this may be a UTF-16 encoded surrogate pair: // we expect another \uXXXX sequence - if (i + 6 < end_index && in.first[i + 1] == '\\' && in.first[i + 2] == 'u') { + if (i + 6 < end_index && in.first[i + 1] == '\\' && in.first[i + 2] == 'u' && + hex_val >= 0xD800) { auto hex_low_val = string_to_hex(&in.first[i + 3]); - if (hex_val < 0xD800 && hex_low_val < 0xDC00) { + if (hex_val < 0xD800 || hex_low_val < 0xDC00) { // TODO signal parsing error: not all 4 hex digits continue; } From e2fae0221f04ad4781156b6abf75b6c8cfb86d4e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 26 Aug 2022 00:03:45 -0700 Subject: [PATCH 083/173] resolves merge conflicts --- cpp/tests/io/nested_json_test.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 928635b29da..f0ececaf4eb 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -248,9 +248,6 @@ TEST_F(JsonTest, TokenStream) // Default parsing options cudf::io::json_reader_options default_options{}; - // Default parsing options - cudf::io::json_reader_options default_options{}; - // Test input std::string const input = R"( [{)" R"("category": "reference",)" @@ -365,9 +362,6 @@ TEST_F(JsonTest, UTF_JSON) // Default parsing options cudf::io::json_reader_options default_options{}; - // Default parsing options - cudf::io::json_reader_options default_options{}; - // Only ASCII string std::string const ascii_pass = R"([ {"a":1,"b":2,"c":[3], "d": {}}, @@ -414,9 +408,6 @@ TEST_F(JsonTest, FromParquet) // Default parsing options cudf::io::json_reader_options default_options{}; - // Default parsing options - cudf::io::json_reader_options default_options{}; - // Binary parquet data containing the same data as the data represented by the JSON string. // We could add a dataset to include this file, but we don't want tests in cudf to have data. 
const unsigned char parquet_data[] = { From 872c33218e9a71f1e11dd0f2b111de92674aedaa Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 26 Aug 2022 00:04:35 -0700 Subject: [PATCH 084/173] fixes nullable behaviour to match nested json reader --- cpp/src/io/json/nested_json_gpu.cu | 5 ++++- cpp/tests/io/nested_json_test.cpp | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 822b7d0dfaa..99ffca27d2a 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1444,7 +1444,7 @@ auto default_inference_options() auto const stream = rmm::cuda_stream_default; parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"null"}, stream); return parse_opts; } @@ -1507,6 +1507,9 @@ std::pair, std::vector> json_column_to stream, mr); + // Reset nullable if we do not have nulls + if (col->null_count() == 0) { col->set_null_mask({}); } + return {std::move(col), {{"offsets"}, {"chars"}}}; break; } diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index f0ececaf4eb..c5def5e4edd 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -346,8 +346,8 @@ TEST_F(JsonTest, ExtractColumn) auto const second_column_index = 1; EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); - auto expected_col1 = cudf::test::strings_column_wrapper({"0.0", "0.1", "0.2"}); - auto expected_col2 = cudf::test::strings_column_wrapper({"1.0", "1.1", "1.2"}); + auto expected_col1 = cudf::test::fixed_width_column_wrapper({0.0, 0.1, 0.2}); + auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}); cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); From f12311874d9368b0308669f8b278d317534596a1 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 26 Aug 2022 02:26:30 -0700 Subject: [PATCH 085/173] migrates test to pytest --- cpp/tests/io/nested_json_test.cpp | 108 ---------------------------- python/cudf/cudf/tests/test_json.py | 11 +++ 2 files changed, 11 insertions(+), 108 deletions(-) diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index c5def5e4edd..b8c3a97054c 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -394,111 +394,3 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip Ê’akotÉ›"}}])"; CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, default_options, stream)); } - -TEST_F(JsonTest, FromParquet) -{ - using cuio_json::SymbolT; - - std::string const input = - R"([{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}])"; - - // Prepare cuda stream for data transfers & kernels - constexpr auto stream = cudf::default_stream_value; - - // Default parsing options - cudf::io::json_reader_options default_options{}; - - // Binary parquet data containing the same data as the data represented by the JSON string. 
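  // The embedded bytes appear to describe the same three-row table as the JSON literal
  // above: column "0" is a struct with an optional string field "a", column "1" is a
  // list of lists of strings, and column "2" is a struct with string fields "0", "1"
  // and "2" (the pandas metadata embedded in the file also records "1" as
  // list[list[unicode]]).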
- // We could add a dataset to include this file, but we don't want tests in cudf to have data. - const unsigned char parquet_data[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x18, 0x15, 0x18, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x21, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x31, 0x15, 0x00, 0x15, 0x24, 0x15, 0x20, 0x2C, 0x15, 0x08, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, - 0x00, 0x00, 0x12, 0x18, 0x03, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x05, 0x07, 0x04, 0x2D, 0x00, - 0x01, 0x01, 0x15, 0x00, 0x15, 0x22, 0x15, 0x22, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, - 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x07, 0x00, 0x00, 0x00, 0x57, 0x26, 0x52, - 0x52, 0x3D, 0x2B, 0x49, 0x15, 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00, 0x15, - 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00, 0x15, 0x02, 0x19, 0xCC, 0x48, 0x06, - 0x73, 0x63, 0x68, 0x65, 0x6D, 0x61, 0x15, 0x06, 0x00, 0x35, 0x02, 0x18, 0x01, 0x30, 0x15, 0x02, - 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x61, 0x25, 0x00, 0x00, 0x35, 0x02, 0x18, 0x01, 0x31, - 0x15, 0x02, 0x15, 0x06, 0x00, 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, - 0x35, 0x00, 0x18, 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x15, 0x02, 0x15, 0x06, 0x00, - 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, 0x15, 0x0C, 0x25, 0x00, 0x18, - 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x25, 0x00, 0x00, 0x35, 0x00, 0x18, 0x01, 0x32, - 0x15, 0x06, 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x30, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, - 0x02, 0x18, 0x01, 0x31, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x32, 0x25, 0x00, - 0x00, 0x16, 0x06, 0x19, 0x1C, 0x19, 0x5C, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x30, 0x01, 0x61, 0x15, 0x00, 0x16, 0x06, 0x16, 0x3A, 0x16, 0x3A, 0x26, 0x08, - 0x3C, 0x36, 0x04, 0x28, 0x01, 0x31, 0x18, 0x01, 0x31, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, - 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x58, 0x01, 0x31, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, - 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, 0x6C, 0x65, 0x6D, - 0x65, 0x6E, 0x74, 0x15, 0x02, 0x16, 0x08, 0x16, 0x46, 0x16, 0x42, 0x26, 0x42, 0x3C, 0x36, 0x00, - 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x32, 0x01, 0x30, 0x15, 0x00, 0x16, 0x06, 0x16, 0x44, 0x16, 0x44, 0x26, 0x84, - 0x01, 0x3C, 0x36, 0x04, 0x28, 0x07, 0x57, 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x18, 0x07, 0x57, - 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, - 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x31, 0x15, 0x00, 0x16, 0x06, 0x16, 0x36, 0x16, 0x36, - 0x26, 0xC8, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, - 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x32, 0x15, 0x00, 0x16, 0x06, - 0x16, 0x36, 0x16, 0x36, 0x26, 0xFE, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x00, 0x16, 0xAC, 0x02, 0x16, 0x06, 0x00, 0x19, 0x1C, 0x18, 0x06, 0x70, 0x61, 0x6E, 0x64, 0x61, - 0x73, 0x18, 0xFE, 0x04, 0x7B, 0x22, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x5F, 0x63, 0x6F, 0x6C, 0x75, - 0x6D, 0x6E, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6B, 0x69, 0x6E, 0x64, 0x22, 
0x3A, 0x20, - 0x22, 0x72, 0x61, 0x6E, 0x67, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, - 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 0x73, 0x74, 0x61, 0x72, 0x74, 0x22, 0x3A, 0x20, - 0x30, 0x2C, 0x20, 0x22, 0x73, 0x74, 0x6F, 0x70, 0x22, 0x3A, 0x20, 0x33, 0x2C, 0x20, 0x22, 0x73, - 0x74, 0x65, 0x70, 0x22, 0x3A, 0x20, 0x31, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75, - 0x6D, 0x6E, 0x5F, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x65, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, - 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 0x66, 0x69, - 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, - 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, - 0x22, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70, - 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, - 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x7B, - 0x22, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67, 0x22, 0x3A, 0x20, 0x22, 0x55, 0x54, 0x46, - 0x2D, 0x38, 0x22, 0x7D, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75, 0x6D, 0x6E, 0x73, - 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x30, 0x22, - 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, - 0x22, 0x30, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, - 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E, - 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, - 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, - 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, - 0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, - 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, - 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6C, 0x69, 0x73, 0x74, 0x5B, 0x6C, - 0x69, 0x73, 0x74, 0x5B, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x5D, 0x5D, 0x22, 0x2C, 0x20, - 0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, - 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, - 0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D, - 0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, - 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, - 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, - 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, - 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, - 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x5D, 0x2C, - 0x20, 0x22, 0x63, 0x72, 0x65, 0x61, 0x74, 0x6F, 0x72, 0x22, 0x3A, 0x20, 0x7B, 0x22, 0x6C, 0x69, - 0x62, 0x72, 0x61, 0x72, 0x79, 0x22, 0x3A, 0x20, 0x22, 0x70, 0x79, 0x61, 0x72, 0x72, 0x6F, 0x77, - 0x22, 0x2C, 0x20, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x38, - 0x2E, 0x30, 
0x2E, 0x31, 0x22, 0x7D, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, - 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x2E, 0x34, 0x2E, 0x33, - 0x22, 0x7D, 0x00, 0x29, 0x5C, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00, - 0x00, 0x1C, 0x00, 0x00, 0x00, 0x0B, 0x04, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - - // Read in the data via parquet reader - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(parquet_data), sizeof(parquet_data)}); - auto result = cudf::io::read_parquet(read_opts); - - // Read in the data via the JSON parser - auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, default_options, stream); - - // Verify that the data read via parquet matches the data read via JSON - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view()); - - // Verify that the schema read via parquet matches the schema read via JSON - cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata); -} diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index f3d9180d44d..0dfde356b62 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -618,3 +618,14 @@ def test_json_nested_lines(data): # In the second test-case: # Pandas omits "f1" in first row, so we have to enforce a common schema assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) + + +def test_json_nested_data(): + json_str = R'[{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' + bytes = BytesIO() + df = cudf.read_json( + StringIO(json_str), engine="cudf_experimental", orient="records" + ) + pdf = pd.read_json(StringIO(json_str), orient="records") + + assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) From b23311d68de9eb83448df1d20d32a002437ada80 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 26 Aug 2022 03:28:08 -0700 Subject: [PATCH 086/173] fixes pytest style --- python/cudf/cudf/tests/test_json.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 0dfde356b62..13908a0ed45 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -621,8 +621,10 @@ def test_json_nested_lines(data): def test_json_nested_data(): - json_str = R'[{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' - bytes = BytesIO() + json_str = ( + R'[{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},' + R'{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' + ) df = cudf.read_json( StringIO(json_str), engine="cudf_experimental", orient="records" ) From d1949ca22e683a685e2978c53fecd0f234790842 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 26 Aug 2022 07:17:39 -0700 Subject: [PATCH 087/173] adds option to keep quotes for string values --- cpp/include/cudf/io/json.hpp | 30 +++ cpp/src/io/json/data_casting.cuh | 366 +++++++++++++++-------------- cpp/src/io/json/nested_json_gpu.cu | 9 +- cpp/tests/io/nested_json_test.cpp | 30 +++ 4 files changed, 257 insertions(+), 178 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 73724b99589..411aec8d1f2 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -83,6 
+83,9 @@ class json_reader_options { // Whether to use the experimental reader bool _experimental = false; + // Whether to keep the quote characters of string values + bool _keep_quotes = false; + /** * @brief Constructor from source info. * @@ -203,6 +206,13 @@ class json_reader_options { */ bool is_enabled_experimental() const { return _experimental; } + /** + * @brief Whether the experimental reader should keep quotes of string values. + * + * @returns true the experimental reader should keep quotes, false otherwise + */ + bool is_keeping_quotes() const { return _keep_quotes; } + /** * @brief Set data types for columns to be read. * @@ -258,6 +268,14 @@ class json_reader_options { * @param val Boolean value to enable/disable the experimental reader */ void enable_experimental(bool val) { _experimental = val; } + + /** + * @brief Set whether the experimental reader should keep quotes of string values. + * + * @param val Boolean value whether to indicate whether the experimental reader should keep quotes + * of string values + */ + void keep_quotes(bool val) { _keep_quotes = val; } }; /** @@ -377,6 +395,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the experimental reader should keep quotes of string values. + * + * @param val Boolean value whether to indicate whether the experimental reader should keep quotes + * of string values + */ + json_reader_options_builder& keep_quotes(bool val) + { + options._keep_quotes = val; + return *this; + } + /** * @brief move json_reader_options member once it's built. */ diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 4594942a22d..179c2125cc7 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -86,110 +86,117 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, { if (col_type == cudf::data_type{cudf::type_id::STRING}) { rmm::device_uvector offsets(col_size + 1, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - sizes = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { - sizes[row] = 0; - return; - } - auto const in = str_tuples[row]; - - auto const is_null_literal = serialized_trie_contains( - options.trie_na, {in.first, static_cast(in.second)}); - if (is_null_literal) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. 
{"a":1.23}) - bool const is_string_value = - in.second >= 2 && (*in.first == '"') && (in.first[in.second - 1] == '"'); - - decltype(in.second) out_size = in.second; - - if (is_string_value) { - // Strip off quote chars - out_size = 0; - - // Check - bool escape = false; - - // Exclude beginning and ending quote chars from string range - auto end_index = in.second - 1; - for (decltype(in.second) i = 1; i < end_index; ++i) { - // Previous char was escape char - if (escape) { - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(in.first[i]); - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - if (escaped_char == UNICODE_SEQ) { - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (i + 4 < end_index) { - auto hex_val = string_to_hex(&in.first[i + 1]); - if (hex_val < 0) { - // TODO signal parsing error: not all 4 hex digits - continue; - } - // Skip over the four hex digits - i += 4; - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - if (i + 6 < end_index && in.first[i + 1] == '\\' && - in.first[i + 2] == 'u' && hex_val >= 0xD800) { - auto hex_low_val = string_to_hex(&in.first[i + 3]); - if (hex_val < 0xD800 || hex_low_val < 0xDC00) { - // TODO signal parsing error: not all 4 hex digits - continue; - } - // Skip over the second \uXXXX sequence - i += 6; - uint32_t unicode_code_point = - 0x10000 + (hex_val - 0xD800) + (hex_low_val - 0xDC00); - auto utf8_chars = - strings::detail::codepoint_to_utf8(unicode_code_point); - out_size += strings::detail::bytes_in_char_utf8(utf8_chars); - } - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - out_size += strings::detail::bytes_in_char_utf8(utf8_chars); - } - } else { - // TODO signal parsing error: expected 4 hex digits - } - } else if (escaped_char == NON_ESCAPE_CHAR) { - // TODO signal parsing error: this char does not need to be escape - } else { - out_size++; - } - } else { - escape = in.first[i] == '\\'; - out_size += escape ? 0 : 1; - } - } - if (escape) { - // TODO signal parsing error: last char was escape, not followed by - // anything to escape - } - } - - // Strip quotes if asked to do so - sizes[row] = out_size; - }); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + sizes = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { + sizes[row] = 0; + return; + } + auto const in = str_tuples[row]; + + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + if (is_null_literal) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. 
{"a":1.23}) + char const quote_char = options.quotechar; + bool const is_string_value = + in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); + + // Handling non-string values + if (not is_string_value) { sizes[row] = in.second; } + + // Strip off quote chars + decltype(in.second) out_size = 0; + + // Escape-flag, set after encountering an escape character + bool escape = false; + + // Exclude beginning and ending quote chars from string range + auto start_index = options.keepquotes ? 0 : 1; + auto end_index = in.second - (options.keepquotes ? 0 : 1); + for (decltype(in.second) i = start_index; i < end_index; ++i) { + // Previous char was an escape char + if (escape) { + // A unicode code point escape sequence is \uXXXX + auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = 6; + // The escape sequence comprises four hex digits + auto constexpr NUM_UNICODE_ESC_HEX_DIGITS = 4; + // A name for the char following the current one + auto constexpr NEXT_CHAR = 1; + // A name for the char after the next char + auto constexpr NEXT_NEXT_CHAR = 2; + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(in.first[i]); + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + if (escaped_char == UNICODE_SEQ) { + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { + auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); + if (hex_val < 0) { + // TODO signal parsing error: not all 4 hex digits + continue; + } + // Skip over the four hex digits + i += NUM_UNICODE_ESC_HEX_DIGITS; + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + if (hex_val >= 0xD800 && i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && + in.first[i + NEXT_CHAR] == '\\' && in.first[i + NEXT_NEXT_CHAR] == 'u') { + auto hex_low_val = string_to_hex(&in.first[i + 3]); + if (hex_val < 0xD800 || hex_low_val < 0xDC00) { + // TODO signal parsing error: not all 4 hex digits + continue; + } + // Skip over the second \uXXXX sequence + i += NUM_UNICODE_ESC_SEQ_CHARS; + uint32_t unicode_code_point = + 0x10000 + (hex_val - 0xD800) + (hex_low_val - 0xDC00); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + // Just a single \uXXXX sequence + else { + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + } else { + // TODO signal parsing error: expected 4 hex digits + } + } else if (escaped_char == NON_ESCAPE_CHAR) { + // TODO signal parsing error: this char does not need to be escape + } else { + out_size++; + } + } else { + escape = in.first[i] == '\\'; + out_size += escape ? 0 : 1; + } + } + if (escape) { + // TODO signal parsing error: last char was escape, not followed by + // anything to escape + } + sizes[row] = out_size; + }); thrust::exclusive_scan( rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); @@ -206,89 +213,100 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, options] __device__(size_type row) { if (not bit_is_set(null_mask, row)) { return; } auto const in = str_tuples[row]; + // Whether in the original JSON this was a string value enclosed in quotes // ({"a":"foo"} vs. 
{"a":1.23}) + char const quote_char = options.quotechar; bool const is_string_value = - in.second >= 2 && (*in.first == '"') && (in.first[in.second - 1] == '"'); - - // Copy string value with quote char and escape sequence handling - if (is_string_value) { - decltype(in.second) start_index = (is_string_value ? 1 : 0); - decltype(in.second) end_index = (is_string_value ? in.second - 1 : in.second); - // Check - bool escape = false; - for (int i = start_index, j = 0; i < end_index; ++i) { - // Previous char was escape char - if (escape) { - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(in.first[i]); - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - if (escaped_char == UNICODE_SEQ) { - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (i + 4 < end_index) { - auto hex_val = string_to_hex(&in.first[i + 1]); - if (hex_val < 0) { + in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); + + // Copy literal/numeric value + if (!is_string_value) { + for (int i = 0, j = 0; i < in.second; ++i) { + chars[offsets[row] + j] = *(in.first + i); + j++; + } + } + + // Escape-flag, set after encountering an escape character + bool escape = false; + + // Exclude beginning and ending quote chars from string range + auto start_index = options.keepquotes ? 0 : 1; + auto end_index = in.second - (options.keepquotes ? 0 : 1); + + for (int i = start_index, j = 0; i < end_index; ++i) { + // Previous char was escape char + if (escape) { + // A unicode code point escape sequence is \uXXXX + auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = 6; + // The escape sequence comprises four hex digits + auto constexpr NUM_UNICODE_ESC_HEX_DIGITS = 4; + // A name for the char following the current one + auto constexpr NEXT_CHAR = 1; + // A name for the char after the next char + auto constexpr NEXT_NEXT_CHAR = 2; + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(in.first[i]); + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + if (escaped_char == UNICODE_SEQ) { + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { + auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); + if (hex_val < 0) { + // TODO signal parsing error: not all 4 hex digits + continue; + } + // Skip over the four hex digits + i += NUM_UNICODE_ESC_HEX_DIGITS; + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + if (hex_val >= 0xD800 && i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && + in.first[i + NEXT_CHAR] == '\\' && in.first[i + NEXT_NEXT_CHAR] == 'u') { + auto hex_low_val = string_to_hex(&in.first[i + 3]); + if (hex_val < 0xD800 || hex_low_val < 0xDC00) { // TODO signal parsing error: not all 4 hex digits continue; } - // Skip over the four hex digits - i += 4; - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - if (i + 6 < end_index && in.first[i + 1] == '\\' && in.first[i + 2] == 'u' && - hex_val >= 0xD800) { - auto hex_low_val = string_to_hex(&in.first[i + 3]); - if (hex_val < 0xD800 || hex_low_val < 0xDC00) { - // TODO 
signal parsing error: not all 4 hex digits - continue; - } - // Skip over the second \uXXXX sequence - i += 6; - uint32_t unicode_code_point = - 0x10000 + ((hex_val - 0xD800) << 10) + (hex_low_val - 0xDC00); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); - } - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); - } - } else { - // TODO signal parsing error: expected 4 hex digits + // Skip over the second \uXXXX sequence + i += NUM_UNICODE_ESC_SEQ_CHARS; + uint32_t unicode_code_point = + 0x10000 + ((hex_val - 0xD800) << 10) + (hex_low_val - 0xDC00); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); + } + // Just a single \uXXXX sequence + else { + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); } - } else if (escaped_char == NON_ESCAPE_CHAR) { - // TODO signal parsing error: this char does not need to be escape } else { - chars[offsets[row] + j] = escaped_char; - j++; + // TODO signal parsing error: expected 4 hex digits } + } else if (escaped_char == NON_ESCAPE_CHAR) { + // TODO signal parsing error: this char does not need to be escape } else { - escape = in.first[i] == '\\'; - if (!escape) { - chars[offsets[row] + j] = *(in.first + i); - j++; - } + chars[offsets[row] + j] = escaped_char; + j++; + } + } else { + escape = in.first[i] == '\\'; + if (!escape) { + chars[offsets[row] + j] = *(in.first + i); + j++; } - } - if (escape) { - // TODO signal parsing error: last char was escape, not followed by - // anything to escape } } - // Copy literal/numeric value - else { - for (int i = 0, j = 0; i < in.second; ++i) { - chars[offsets[row] + j] = *(in.first + i); - j++; - } + if (escape) { + // TODO signal parsing error: last char was escape, not followed by + // anything to escape } }); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 99ffca27d2a..0354b094a83 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1426,18 +1426,19 @@ void make_json_column(json_column& root_column, root_column.level_child_cols_recursively(root_column.current_offset); } -auto default_json_options() +auto casting_options(cudf::io::json_reader_options const& options) { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; auto const stream = rmm::cuda_stream_default; + parse_opts.keepquotes = options.is_keeping_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); return parse_opts; } -auto default_inference_options() +auto inference_options(cudf::io::json_reader_options const& options) { cudf::io::detail::inference_options parse_opts{}; @@ -1496,14 +1497,14 @@ std::pair, std::vector> json_column_to // Infer column type auto target_type = cudf::io::detail::detect_data_type( - default_inference_options().view(), d_input, string_ranges_it, col_size, stream); + inference_options(options).view(), d_input, string_ranges_it, col_size, stream); // Convert strings to the inferred data type auto col = 
cudf::io::json::experimental::parse_data(string_spans_it, col_size, target_type, make_validity(json_col).first, - default_json_options().view(), + casting_options(options).view(), stream, mr); diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index b8c3a97054c..5f311610add 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -394,3 +394,33 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip Ê’akotÉ›"}}])"; CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, default_options, stream)); } + +TEST_F(JsonTest, ExtractColumnWithQuotes) +{ + using cuio_json::SymbolT; + + // Prepare cuda stream for data transfers & kernels + constexpr auto stream = cudf::default_stream_value; + + // Default parsing options + cudf::io::json_reader_options options{}; + options.keep_quotes(true); + + std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )"; + // Get the JSON's tree representation + auto const cudf_table = cuio_json::detail::parse_nested_json( + cudf::host_span{input.data(), input.size()}, options, stream); + + auto constexpr expected_col_count = 2; + auto constexpr first_column_index = 0; + auto constexpr second_column_index = 1; + EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); + + auto expected_col1 = + cudf::test::strings_column_wrapper({R"("0.0")", R"()", R"("2.0")"}, {true, false, true}); + auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}); + cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); + cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); +} From 6c423510689508fb80b6fd94e99ced17b166d1af Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 26 Aug 2022 07:42:03 -0700 Subject: [PATCH 088/173] fixes doxygen --- cpp/include/cudf/io/json.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 411aec8d1f2..8427b4c5e43 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -272,7 +272,7 @@ class json_reader_options { /** * @brief Set whether the experimental reader should keep quotes of string values. * - * @param val Boolean value whether to indicate whether the experimental reader should keep quotes + * @param val Boolean value to indicate whether the experimental reader should keep quotes * of string values */ void keep_quotes(bool val) { _keep_quotes = val; } @@ -398,8 +398,9 @@ class json_reader_options_builder { /** * @brief Set whether the experimental reader should keep quotes of string values. 
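For orientation, a minimal sketch of how the new keep-quotes option is driven from the public API, reusing the input string from the ExtractColumnWithQuotes test above. The builder/build()/read_json() entry points and the variable names are assumptions for illustration, not part of these patches; the two setters are the ones introduced here:

  std::string const json = R"([{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}])";
  auto opts = cudf::io::json_reader_options::builder(
                cudf::io::source_info{json.data(), json.size()})
                .build();
  opts.enable_experimental(true);  // route through the experimental (nested) reader
  opts.keep_quotes(true);          // setter added in this patch series
  // Column "a" is then read back as the quoted strings "\"0.0\"" and "\"2.0\"",
  // i.e. the surrounding quote characters are preserved in the output strings.
  auto const result = cudf::io::read_json(opts);
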
* - * @param val Boolean value whether to indicate whether the experimental reader should keep quotes + * @param val Boolean value to indicate whether the experimental reader should keep quotes * of string values + * @return this for chaining */ json_reader_options_builder& keep_quotes(bool val) { From b48da1a5d84060ca1ecb6d1d9207e34a74070495 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 26 Aug 2022 23:08:50 -0700 Subject: [PATCH 089/173] parses to rows to null for failing value casting --- cpp/src/io/json/data_casting.cuh | 72 +++++++++++++++++--------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 179c2125cc7..180fdff1f3d 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -110,12 +110,15 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // Whether in the original JSON this was a string value enclosed in quotes // ({"a":"foo"} vs. {"a":1.23}) - char const quote_char = options.quotechar; + char const quote_char = '"'; bool const is_string_value = in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); // Handling non-string values - if (not is_string_value) { sizes[row] = in.second; } + if (not is_string_value) { + sizes[row] = in.second; + return; + } // Strip off quote chars decltype(in.second) out_size = 0; @@ -151,21 +154,22 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); if (hex_val < 0) { - // TODO signal parsing error: not all 4 hex digits - continue; + sizes[row] = 0; + clear_bit(null_mask, row); + return; } // Skip over the four hex digits i += NUM_UNICODE_ESC_HEX_DIGITS; // If this may be a UTF-16 encoded surrogate pair: // we expect another \uXXXX sequence - if (hex_val >= 0xD800 && i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && - in.first[i + NEXT_CHAR] == '\\' && in.first[i + NEXT_NEXT_CHAR] == 'u') { - auto hex_low_val = string_to_hex(&in.first[i + 3]); - if (hex_val < 0xD800 || hex_low_val < 0xDC00) { - // TODO signal parsing error: not all 4 hex digits - continue; - } + int64_t hex_low_val = 0; + if (i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && in.first[i + NEXT_CHAR] == '\\' && + in.first[i + NEXT_NEXT_CHAR] == 'u') { + hex_low_val = string_to_hex(&in.first[i + 3]); + } + // This is indeed a surrogate pair + if (hex_val >= 0xD800 && hex_low_val >= 0xDC00) { // Skip over the second \uXXXX sequence i += NUM_UNICODE_ESC_SEQ_CHARS; uint32_t unicode_code_point = @@ -179,10 +183,14 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, out_size += strings::detail::bytes_in_char_utf8(utf8_chars); } } else { - // TODO signal parsing error: expected 4 hex digits + sizes[row] = 0; + clear_bit(null_mask, row); + return; } } else if (escaped_char == NON_ESCAPE_CHAR) { - // TODO signal parsing error: this char does not need to be escape + sizes[row] = 0; + clear_bit(null_mask, row); + return; } else { out_size++; } @@ -192,8 +200,9 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, } } if (escape) { - // TODO signal parsing error: last char was escape, not followed by - // anything to escape + sizes[row] = 0; + clear_bit(null_mask, row); + return; } sizes[row] = out_size; }); @@ -216,16 +225,17 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // Whether in the original JSON this was a string value enclosed in quotes // ({"a":"foo"} vs. 
{"a":1.23}) - char const quote_char = options.quotechar; + char const quote_char = '"'; bool const is_string_value = in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); // Copy literal/numeric value - if (!is_string_value) { + if (not is_string_value) { for (int i = 0, j = 0; i < in.second; ++i) { chars[offsets[row] + j] = *(in.first + i); j++; } + return; } // Escape-flag, set after encountering an escape character @@ -259,22 +269,19 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // input, which are expected to be hex digits if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); - if (hex_val < 0) { - // TODO signal parsing error: not all 4 hex digits - continue; - } + if (hex_val < 0) { return; } // Skip over the four hex digits i += NUM_UNICODE_ESC_HEX_DIGITS; // If this may be a UTF-16 encoded surrogate pair: // we expect another \uXXXX sequence - if (hex_val >= 0xD800 && i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && - in.first[i + NEXT_CHAR] == '\\' && in.first[i + NEXT_NEXT_CHAR] == 'u') { - auto hex_low_val = string_to_hex(&in.first[i + 3]); - if (hex_val < 0xD800 || hex_low_val < 0xDC00) { - // TODO signal parsing error: not all 4 hex digits - continue; - } + int64_t hex_low_val = 0; + if (i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && in.first[i + NEXT_CHAR] == '\\' && + in.first[i + NEXT_NEXT_CHAR] == 'u') { + hex_low_val = string_to_hex(&in.first[i + 3]); + } + // This is indeed a surrogate pair + if (hex_val >= 0xD800 && hex_low_val >= 0xDC00) { // Skip over the second \uXXXX sequence i += NUM_UNICODE_ESC_SEQ_CHARS; uint32_t unicode_code_point = @@ -288,10 +295,10 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); } } else { - // TODO signal parsing error: expected 4 hex digits + return; } } else if (escaped_char == NON_ESCAPE_CHAR) { - // TODO signal parsing error: this char does not need to be escape + return; } else { chars[offsets[row] + j] = escaped_char; j++; @@ -304,10 +311,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, } } } - if (escape) { - // TODO signal parsing error: last char was escape, not followed by - // anything to escape - } + if (escape) { return; } }); return make_strings_column( From 7fcfcbbd75a07b57b09733c3fce7ff169856382a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sat, 27 Aug 2022 02:46:56 -0700 Subject: [PATCH 090/173] adds test for escape sequences --- cpp/tests/io/nested_json_test.cpp | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 5f311610add..8e9e9057e7c 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -424,3 +424,36 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } + +TEST_F(JsonTest, EscapeSequenceTests) +{ + using cuio_json::SymbolT; + + // Prepare cuda stream for data transfers & kernels + constexpr auto stream = cudf::default_stream_value; + + // Default parsing options + cudf::io::json_reader_options options{}; + options.keep_quotes(true); + + std::string const input = + R"( [{"a":"🚀", "b":"\uD83D\uDE80"},)" + R"( {"a":null, "b":"invalid char being escaped escape char\-"},)" + R"( {"a":null, "b":"too few hex digits \u12"},)" + 
R"( {"a":null, "b":"too few hex digits for surrogate pair \uD83D\uDE"},)" + R"( {"a":"\\", "b":"\u005C"},)" + R"( {"a":"âž©", "b":"\u27A9"}] )"; + + // Get the JSON's tree representation + auto const cudf_table = cuio_json::detail::parse_nested_json( + cudf::host_span{input.data(), input.size()}, options, stream); + + auto constexpr expected_col_count = 2; + auto constexpr first_column_index = 0; + auto constexpr second_column_index = 1; + EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); + + cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); + cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(parsed_col1, parsed_col2); +} From eceefe9169dfb3b7bca7df22974d44ff16485269 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Sat, 27 Aug 2022 02:56:15 -0700 Subject: [PATCH 091/173] adds string value handling --- cpp/src/io/json/data_casting.cuh | 306 +++++++++++++++++++++++++++---- 1 file changed, 269 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 437b3093af7..180fdff1f3d 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,52 @@ namespace cudf::io::json::experimental { +constexpr char UNICODE_SEQ = 0x7F; +constexpr char NON_ESCAPE_CHAR = 0x7E; +__device__ __forceinline__ char get_escape_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return 0x22; + case '\\': return 0x5C; + case '/': return 0x2F; + case 'b': return 0x08; + case 'f': return 0x0C; + case 'n': return 0x0A; + case 'r': return 0x0D; + case 't': return 0x09; + case 'u': return UNICODE_SEQ; + default: return NON_ESCAPE_CHAR; + } +} + +__device__ __forceinline__ int64_t string_to_hex(char const* str) +{ + // Unicode code point escape sequence comprises four hex characters + constexpr size_type unicode_hex_digits = 4; + + // Prepare result + int64_t result = 0, base = 1; + + // Iterate over hex digits right-to-left + size_type index = unicode_hex_digits; + while (index-- > 0) { + char const ch = str[index]; + if (ch >= '0' && ch <= '9') { + result += static_cast((ch - '0') + 0) * base; + base *= 16; + } else if (ch >= 'A' && ch <= 'F') { + result += static_cast((ch - 'A') + 10) * base; + base *= 16; + } else if (ch >= 'a' && ch <= 'f') { + result += static_cast((ch - 'a') + 10) * base; + base *= 16; + } else { + return -1; + } + } + return result; +} + template std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, @@ -39,48 +86,233 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, { if (col_type == cudf::data_type{cudf::type_id::STRING}) { rmm::device_uvector offsets(col_size + 1, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - sizes = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { - sizes[row] = 0; - return; - } - auto const in = str_tuples[row]; - - auto const is_null_literal = serialized_trie_contains( - options.trie_na, {in.first, static_cast(in.second)}); - if (is_null_literal) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - - sizes[row] = in.second; - }); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + sizes = 
device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { + sizes[row] = 0; + return; + } + auto const in = str_tuples[row]; + + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + if (is_null_literal) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. {"a":1.23}) + char const quote_char = '"'; + bool const is_string_value = + in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); + + // Handling non-string values + if (not is_string_value) { + sizes[row] = in.second; + return; + } + + // Strip off quote chars + decltype(in.second) out_size = 0; + + // Escape-flag, set after encountering an escape character + bool escape = false; + + // Exclude beginning and ending quote chars from string range + auto start_index = options.keepquotes ? 0 : 1; + auto end_index = in.second - (options.keepquotes ? 0 : 1); + for (decltype(in.second) i = start_index; i < end_index; ++i) { + // Previous char was an escape char + if (escape) { + // A unicode code point escape sequence is \uXXXX + auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = 6; + // The escape sequence comprises four hex digits + auto constexpr NUM_UNICODE_ESC_HEX_DIGITS = 4; + // A name for the char following the current one + auto constexpr NEXT_CHAR = 1; + // A name for the char after the next char + auto constexpr NEXT_NEXT_CHAR = 2; + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(in.first[i]); + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + if (escaped_char == UNICODE_SEQ) { + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { + auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); + if (hex_val < 0) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + // Skip over the four hex digits + i += NUM_UNICODE_ESC_HEX_DIGITS; + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + int64_t hex_low_val = 0; + if (i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && in.first[i + NEXT_CHAR] == '\\' && + in.first[i + NEXT_NEXT_CHAR] == 'u') { + hex_low_val = string_to_hex(&in.first[i + 3]); + } + // This is indeed a surrogate pair + if (hex_val >= 0xD800 && hex_low_val >= 0xDC00) { + // Skip over the second \uXXXX sequence + i += NUM_UNICODE_ESC_SEQ_CHARS; + uint32_t unicode_code_point = + 0x10000 + (hex_val - 0xD800) + (hex_low_val - 0xDC00); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + // Just a single \uXXXX sequence + else { + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + out_size += strings::detail::bytes_in_char_utf8(utf8_chars); + } + } else { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + } else if (escaped_char == NON_ESCAPE_CHAR) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } else { + out_size++; + } + } else { + escape = in.first[i] == '\\'; + out_size += escape ? 
0 : 1; + } + } + if (escape) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } + sizes[row] = out_size; + }); thrust::exclusive_scan( rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); rmm::device_uvector chars(offsets.back_element(stream), stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - chars = device_span{chars}, - offsets = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { return; } - auto const in = str_tuples[row]; - for (int i = 0; i < in.second; ++i) { - chars[offsets[row] + i] = *(in.first + i); - } - }); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + chars = device_span{chars}, + offsets = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { return; } + auto const in = str_tuples[row]; + + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. {"a":1.23}) + char const quote_char = '"'; + bool const is_string_value = + in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); + + // Copy literal/numeric value + if (not is_string_value) { + for (int i = 0, j = 0; i < in.second; ++i) { + chars[offsets[row] + j] = *(in.first + i); + j++; + } + return; + } + + // Escape-flag, set after encountering an escape character + bool escape = false; + + // Exclude beginning and ending quote chars from string range + auto start_index = options.keepquotes ? 0 : 1; + auto end_index = in.second - (options.keepquotes ? 0 : 1); + + for (int i = start_index, j = 0; i < end_index; ++i) { + // Previous char was escape char + if (escape) { + // A unicode code point escape sequence is \uXXXX + auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = 6; + // The escape sequence comprises four hex digits + auto constexpr NUM_UNICODE_ESC_HEX_DIGITS = 4; + // A name for the char following the current one + auto constexpr NEXT_CHAR = 1; + // A name for the char after the next char + auto constexpr NEXT_NEXT_CHAR = 2; + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(in.first[i]); + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + if (escaped_char == UNICODE_SEQ) { + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { + auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); + if (hex_val < 0) { return; } + // Skip over the four hex digits + i += NUM_UNICODE_ESC_HEX_DIGITS; + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + int64_t hex_low_val = 0; + if (i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && in.first[i + NEXT_CHAR] == '\\' && + in.first[i + NEXT_NEXT_CHAR] == 'u') { + hex_low_val = string_to_hex(&in.first[i + 3]); + } + // This is indeed a surrogate pair + if (hex_val >= 0xD800 && hex_low_val >= 0xDC00) { + // Skip over the second \uXXXX sequence + i += NUM_UNICODE_ESC_SEQ_CHARS; + uint32_t unicode_code_point = + 0x10000 + ((hex_val - 0xD800) << 10) + (hex_low_val - 0xDC00); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + j += 
strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); + } + // Just a single \uXXXX sequence + else { + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); + } + } else { + return; + } + } else if (escaped_char == NON_ESCAPE_CHAR) { + return; + } else { + chars[offsets[row] + j] = escaped_char; + j++; + } + } else { + escape = in.first[i] == '\\'; + if (!escape) { + chars[offsets[row] + j] = *(in.first + i); + j++; + } + } + } + if (escape) { return; } + }); return make_strings_column( col_size, std::move(offsets), std::move(chars), std::move(null_mask)); From 9d71c85b93165a249d137246a3211b450dfcc6f2 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 29 Aug 2022 10:35:01 -0700 Subject: [PATCH 092/173] adds NVTX range annotation --- cpp/src/io/json/nested_json_gpu.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0354b094a83..02d6f7cfc94 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -918,6 +918,11 @@ void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, rmm::cuda_stream_view stream) { + // Range of encapsulating function that comprises: + // -> DFA simulation for filtering out brackets and braces inside of quotes + // -> Logical stack to infer the stack context + CUDF_FUNC_RANGE(); + constexpr std::size_t single_item = 1; // Symbol representing the JSON-root (i.e., we're at nesting level '0') @@ -971,6 +976,9 @@ std::pair, rmm::device_uvector> ge rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of encapsulating function that parses to internal columnar data representation + CUDF_FUNC_RANGE(); + rmm::device_uvector tokens{json_in.size(), stream, mr}; rmm::device_uvector tokens_indices{json_in.size(), stream, mr}; rmm::device_scalar num_written_tokens{stream, mr}; @@ -1049,6 +1057,9 @@ void make_json_column(json_column& root_column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { + // Range of encapsulating function that parses to internal columnar data representation + CUDF_FUNC_RANGE(); + // Default name for a list's child column std::string const list_child_name = "element"; @@ -1456,6 +1467,9 @@ std::pair, std::vector> json_column_to rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of orchastrating/encapsulating function + CUDF_FUNC_RANGE(); + auto make_validity = [stream, mr](json_column const& json_col) -> std::pair { return {rmm::device_buffer{json_col.validity.data(), @@ -1572,6 +1586,9 @@ table_with_metadata parse_nested_json(host_span input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of orchastrating/encapsulating function + CUDF_FUNC_RANGE(); + auto const new_line_delimited_json = options.is_enabled_lines(); // Allocate device memory for the JSON input & copy over to device From 27643af7ec3260f1f78330ddcdbf260355e61259 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 29 Aug 2022 12:18:11 -0700 Subject: [PATCH 093/173] moves unicode escape tests to type conversion --- cpp/tests/io/nested_json_test.cpp | 33 ------------------------------- 1 file changed, 33 deletions(-) diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 
8e9e9057e7c..5f311610add 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -424,36 +424,3 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } - -TEST_F(JsonTest, EscapeSequenceTests) -{ - using cuio_json::SymbolT; - - // Prepare cuda stream for data transfers & kernels - constexpr auto stream = cudf::default_stream_value; - - // Default parsing options - cudf::io::json_reader_options options{}; - options.keep_quotes(true); - - std::string const input = - R"( [{"a":"🚀", "b":"\uD83D\uDE80"},)" - R"( {"a":null, "b":"invalid char being escaped escape char\-"},)" - R"( {"a":null, "b":"too few hex digits \u12"},)" - R"( {"a":null, "b":"too few hex digits for surrogate pair \uD83D\uDE"},)" - R"( {"a":"\\", "b":"\u005C"},)" - R"( {"a":"âž©", "b":"\u27A9"}] )"; - - // Get the JSON's tree representation - auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, options, stream); - - auto constexpr expected_col_count = 2; - auto constexpr first_column_index = 0; - auto constexpr second_column_index = 1; - EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); - - cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); - cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(parsed_col1, parsed_col2); -} From 1778bdb11120c66f6ee013b44de3f0381f269416 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 29 Aug 2022 21:58:55 -0700 Subject: [PATCH 094/173] cudf default stream and escape seq tests --- cpp/tests/io/json_type_cast_test.cu | 40 ++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index dde4dcc8ba1..04061cfaf4d 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -51,7 +51,7 @@ auto default_json_options() { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - auto const stream = rmm::cuda_stream_default; + auto const stream = cudf::default_stream_value; parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); @@ -60,7 +60,7 @@ auto default_json_options() TEST_F(JSONTypeCastTest, String) { - auto const stream = rmm::cuda_stream_default; + auto const stream = cudf::default_stream_value; auto mr = rmm::mr::get_current_device_resource(); auto const type = cudf::data_type{cudf::type_id::STRING}; @@ -93,7 +93,7 @@ TEST_F(JSONTypeCastTest, String) TEST_F(JSONTypeCastTest, Int) { - auto const stream = rmm::cuda_stream_default; + auto const stream = cudf::default_stream_value; auto mr = rmm::mr::get_current_device_resource(); auto const type = cudf::data_type{cudf::type_id::INT64}; @@ -118,4 +118,38 @@ TEST_F(JSONTypeCastTest, Int) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } +TEST_F(JSONTypeCastTest, StringEscapes) +{ + auto const stream = cudf::default_stream_value; + auto mr = rmm::mr::get_current_device_resource(); + auto const type = cudf::data_type{cudf::type_id::STRING}; + + cudf::test::strings_column_wrapper data({ + R"("\uD83D\uDE80")", + R"("invalid char being escaped 
escape char\-")", + R"("too few hex digits \u12")", + R"("too few hex digits for surrogate pair \uD83D\uDE")", + R"("\u005C")", + R"("\u27A9")", + }); + auto d_column = cudf::column_device_view::create(data); + rmm::device_uvector> svs(d_column->size(), stream); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + svs.begin(), + to_thrust_pair_fn{}); + + auto null_mask_it = no_nulls(); + auto null_mask = + cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); + + auto col = cudf::io::json::experimental::parse_data( + svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); + + auto expected = cudf::test::strings_column_wrapper{{"🚀", "", "", "", "\\", "âž©"}, + {true, false, false, false, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); +} + CUDF_TEST_PROGRAM_MAIN() From 69048f40c7e6370718abc53dd5636f262620fa94 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 30 Aug 2022 01:05:29 -0700 Subject: [PATCH 095/173] test case for null string versus null literal --- cpp/tests/io/json_type_cast_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 04061cfaf4d..dda39e31795 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -65,7 +65,7 @@ TEST_F(JSONTypeCastTest, String) auto const type = cudf::data_type{cudf::type_id::STRING}; auto in_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; }); - std::vector input_values{"this", "is", "null", "of", "", "strings"}; + std::vector input_values{"this", "is", "null", "of", "", "strings", R"("null")"}; cudf::test::strings_column_wrapper input(input_values.begin(), input_values.end(), in_valids); auto d_column = cudf::column_device_view::create(input); @@ -85,7 +85,7 @@ TEST_F(JSONTypeCastTest, String) auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); - std::vector expected_values{"this", "is", "", "of", "", "strings"}; + std::vector expected_values{"this", "is", "", "of", "", "strings", "null"}; cudf::test::strings_column_wrapper expected( expected_values.begin(), expected_values.end(), out_valids); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(str_col->view(), expected); From cc1a04cbcccedf69213ed4b9130ab37c2b018ae5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 30 Aug 2022 09:25:58 -0400 Subject: [PATCH 096/173] Update cpp/src/io/utilities/type_inference.cuh Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/src/io/utilities/type_inference.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 39932836edd..db0f7ec7d09 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -60,7 +60,7 @@ struct inference_options { }; /** - * @brief Returns true is the input character is a valid digit. + * @brief Returns true if the input character is a valid digit. * Supports both decimal and hexadecimal digits (uppercase and lowercase). 
* * @param c Character to check From 756a3e29df343cec396cb76fb6e4a756f3a55a38 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 30 Aug 2022 09:26:15 -0400 Subject: [PATCH 097/173] Update cpp/src/io/utilities/type_inference.cuh Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/src/io/utilities/type_inference.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index db0f7ec7d09..7c78b36e4e1 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -132,7 +132,6 @@ __global__ void detect_column_type_kernel(inference_options_view const options, continue; } - // No need to check strings since it's inferred in the tree generation int digit_count = 0; int decimal_count = 0; int slash_count = 0; From 8961126e12f90f3f403d29b634437e5b6e7960ed Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 30 Aug 2022 09:34:04 -0400 Subject: [PATCH 098/173] Update cpp/src/io/utilities/type_inference.cuh Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/src/io/utilities/type_inference.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 7c78b36e4e1..3954bc7b1f7 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -132,14 +132,14 @@ __global__ void detect_column_type_kernel(inference_options_view const options, continue; } - int digit_count = 0; - int decimal_count = 0; - int slash_count = 0; - int dash_count = 0; - int plus_count = 0; - int colon_count = 0; - int exponent_count = 0; - int other_count = 0; + uint32_t digit_count = 0; + uint32_t decimal_count = 0; + uint32_t slash_count = 0; + uint32_t dash_count = 0; + uint32_t plus_count = 0; + uint32_t colon_count = 0; + uint32_t exponent_count = 0; + uint32_t other_count = 0; auto const maybe_hex = (field_len > 2 && *field_begin == '0' && *(field_begin + 1) == 'x') || (field_len > 3 && *field_begin == '-' && *(field_begin + 1) == '0' && From bb69c14e7c9a775d305578bd16cd1aa5f18ec7bf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 30 Aug 2022 09:35:44 -0400 Subject: [PATCH 099/173] Minor doc updates --- cpp/src/io/utilities/type_inference.cuh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 5beec822566..8af6dd34222 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -35,7 +35,7 @@ namespace cudf { namespace io { namespace detail { /** - * @brief Structure for type inference options + * @brief Non-owning view for type inference options */ struct inference_options_view { cudf::detail::trie_view trie_true; @@ -43,7 +43,9 @@ struct inference_options_view { cudf::detail::trie_view trie_na; char quote_char; }; - +/** + * @brief Structure for type inference options + */ struct inference_options { cudf::detail::optional_trie trie_true; cudf::detail::optional_trie trie_false; @@ -60,7 +62,7 @@ struct inference_options { }; /** - * @brief Returns true is the input character is a valid digit. + * @brief Returns true if the input character is a valid digit. * Supports both decimal and hexadecimal digits (uppercase and lowercase). 
* * @param c Character to check @@ -168,7 +170,7 @@ __global__ void detect_column_type_kernel(inference_options_view const options, } // Integers have to have the length of the string - int int_req_number_cnt = field_len; + auto int_req_number_cnt = field_len; // Off by one if they start with a minus sign if ((*field_begin == '-' || *field_begin == '+') && field_len > 1) { --int_req_number_cnt; } // Off by one if they are a hexadecimal number From ec593badc110e08fb6e3566074e1539d58d70ee2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 30 Aug 2022 10:38:47 -0400 Subject: [PATCH 100/173] Update docs --- cpp/src/io/utilities/type_inference.cuh | 43 ++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 83f34b8e2d1..ed7ec8c304c 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -108,6 +108,18 @@ __device__ __inline__ bool is_like_float( return true; } +/** + * @brief Constructs column type histogram for a given column string input `data`. + * + * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to + * `thrust::tuple` + * + * @param[in] options View of inference options + * @param[in] data JSON string input + * @param[in] column_strings_begin The begining of an offset-length tuple sequence + * @param[in] size Size of the string input + * @param[out] column_info Histogram of column type counters + */ template __global__ void detect_column_type_kernel(inference_options_view const options, device_span const data, @@ -169,7 +181,7 @@ __global__ void detect_column_type_kernel(inference_options_view const options, } // Integers have to have the length of the string - auto int_req_number_cnt = field_len; + auto int_req_number_cnt = static_cast(field_len); // Off by one if they start with a minus sign if ((*field_begin == '-' || *field_begin == '+') && field_len > 1) { --int_req_number_cnt; } // Off by one if they are a hexadecimal number @@ -200,6 +212,19 @@ __global__ void detect_column_type_kernel(inference_options_view const options, } // for } +/** + * @brief Constructs column type histogram for a given column string input `data`. + * + * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to + * `thrust::tuple` + * + * @param options View of inference options + * @param data JSON string input + * @param column_strings_begin The begining of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A histogram containing column-specific type counters + */ template cudf::io::column_type_histogram detect_column_type(inference_options_view const& options, cudf::device_span data, @@ -220,6 +245,22 @@ cudf::io::column_type_histogram detect_column_type(inference_options_view const& return d_column_info.value(stream); } +/** + * @brief Detects data type for a given JSON string input `data`. 
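+ *
+ * For example, given a device span `d_data` of JSON text and an iterator `field_ranges_it` of
+ * (offset, length) pairs selecting one field per row, the inferred type is obtained via
+ * `detect_data_type(options, d_data, field_ranges_it, num_rows, stream)` (illustrative usage
+ * only; the variable names here are placeholders).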
+ * + * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if data type detection failed + * + * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to + * `thrust::tuple` + * + * @param options View of inference options + * @param data JSON string input + * @param column_strings_begin The begining of an offset-length tuple sequence + * @param size Size of the string input + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The detected data type + */ template cudf::data_type detect_data_type(inference_options_view const& options, device_span data, From 015d83d3aff0acc1b541f9dcf71dad9c0da7949d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 30 Aug 2022 23:33:57 -0700 Subject: [PATCH 101/173] fixes cmake order --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ae0c2ce8c01..330fd61da40 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -327,8 +327,8 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp - src/io/json/json_gpu.cu src/io/json/data_casting.cu + src/io/json/json_gpu.cu src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu src/io/json/experimental/read_json.cpp From c0db3cbcaa3ca928627c2d11580dbf02b001377a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 31 Aug 2022 07:58:39 -0700 Subject: [PATCH 102/173] refactors string handling --- cpp/src/io/json/data_casting.cuh | 504 ++++++++++++++++--------------- 1 file changed, 268 insertions(+), 236 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 180fdff1f3d..c730bd3aec8 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -25,12 +25,39 @@ #include #include +#include + #include namespace cudf::io::json::experimental { -constexpr char UNICODE_SEQ = 0x7F; -constexpr char NON_ESCAPE_CHAR = 0x7E; +// Unicode code point escape sequence +static constexpr char UNICODE_SEQ = 0x7F; + +// Invalid escape sequence +static constexpr char NON_ESCAPE_CHAR = 0x7E; + +// Unicode code point escape sequence prefix comprises '\' and 'u' cahrs +static constexpr size_type UNICODE_ESC_PREFIX = 2; + +// Unicode code point escape sequence comprises four hex characters +static constexpr size_type UNICODE_HEX_DIGIT_COUNT = 4; + +// A unicode code point escape sequence is \uXXXX +static auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = UNICODE_ESC_PREFIX + UNICODE_HEX_DIGIT_COUNT; + +static constexpr auto UTF16_HIGH_SURROGATE_BEGIN = 0xD800; +static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; +static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; + +/** + * @brief Returns the character to output for a given escaped character that's following a + * backslash. + * + * @param escaped_char The character following the backslash. + * @return The character to output for a given character that's following a backslash + */ __device__ __forceinline__ char get_escape_char(char escaped_char) { switch (escaped_char) { @@ -47,27 +74,32 @@ __device__ __forceinline__ char get_escape_char(char escaped_char) } } -__device__ __forceinline__ int64_t string_to_hex(char const* str) +/** + * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence + * \uXXXX. 
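+ *
+ * For example, the four characters "00E9" parse to 233 (0x00E9, i.e. code point U+00E9), while
+ * any non-hex character in the sequence, such as 'g', makes the function return -1.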
+ * + * @param str Pointer to the first (most-significant) hex digit + * @return The parsed hex value if successful, -1 otherwise. + */ +__device__ __forceinline__ int32_t parse_unicode_hex(char const* str) { - // Unicode code point escape sequence comprises four hex characters - constexpr size_type unicode_hex_digits = 4; - // Prepare result - int64_t result = 0, base = 1; + int32_t result = 0, base = 1; + constexpr int32_t hex_radix = 16; // Iterate over hex digits right-to-left - size_type index = unicode_hex_digits; + size_type index = UNICODE_HEX_DIGIT_COUNT; while (index-- > 0) { char const ch = str[index]; if (ch >= '0' && ch <= '9') { - result += static_cast((ch - '0') + 0) * base; - base *= 16; + result += static_cast((ch - '0') + 0) * base; + base *= hex_radix; } else if (ch >= 'A' && ch <= 'F') { - result += static_cast((ch - 'A') + 10) * base; - base *= 16; + result += static_cast((ch - 'A') + 10) * base; + base *= hex_radix; } else if (ch >= 'a' && ch <= 'f') { - result += static_cast((ch - 'a') + 10) * base; - base *= 16; + result += static_cast((ch - 'a') + 10) * base; + base *= hex_radix; } else { return -1; } @@ -75,6 +107,169 @@ __device__ __forceinline__ int64_t string_to_hex(char const* str) return result; } +/** + * @brief Writes the UTF-8 byte sequence to \p out_it and returns the iterator to one past the + * last item that was written to \p out_it + */ +template +__device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_it_t out_it) +{ + constexpr size_type MAX_UTF8_BYTES_PER_CODE_POINT = 4; + char char_bytes[MAX_UTF8_BYTES_PER_CODE_POINT]; + auto const num_chars_written = + strings::detail::from_char_utf8(utf8_chars, reinterpret_cast(char_bytes)); + + for (size_type i = 0; i < MAX_UTF8_BYTES_PER_CODE_POINT; i++) { + if (i < num_chars_written) { *out_it++ = char_bytes[i]; } + } + return out_it; +} + +/** + * @brief Processes a string, replaces escape sequences and optionally strips off the quote + * characters. 
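+ *
+ * For example, when quotes are not kept, the quoted input "a\nb" is written out as the three
+ * characters a, <newline>, b, and an input matching the null literal (e.g. null) is reported
+ * as null rather than copied to the output.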
+ * + * @tparam in_iterator_t A bidirectional input iterator type whose value_type is convertible to + * char + * @tparam out_iterator_t A forward output iterator type + * @param in_begin Iterator to the first item to process + * @param in_end Iterator to one past the last item to process + * @param out_it Iterator to the first item to write + * @param options Settings for controlling string processing behavior + * @return A four-tuple of (in_it_end, out_it_end, set_null, is_invalid), where in_it_end is an + * iterator to one past the last character from the input that was processed, out_it_end is an + * iterator to one past the last character that was written, set_null being true if a null literal + * was read or a parsing error occured, and is_invalid being true if a parsing error was + * encountered + */ +template +__device__ __forceinline__ thrust::tuple process_string( + in_iterator_t in_begin, + in_iterator_t in_end, + out_iterator_t out_it, + cudf::io::parse_options_view const& options) +{ + constexpr bool NULL_FLAG = true; + constexpr bool NOT_NULL_FLAG = false; + constexpr bool INVALID_FLAG = true; + constexpr bool NO_ERROR_FLAG = false; + + auto const num_in_chars = thrust::distance(in_begin, in_end); + + // Check if the value corresponds to the null literal + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); + if (is_null_literal) { return {in_begin, out_it, NULL_FLAG, NO_ERROR_FLAG}; } + + // Whether in the original JSON this was a string value enclosed in quotes + // ({"a":"foo"} vs. {"a":1.23}) + char const quote_char = '"'; + char const backslash_char = '\\'; + + // String values are indicated by keeping the quote character + bool const is_string_value = + num_in_chars >= 2LL && (*in_begin == quote_char) && (*thrust::prev(in_end) == quote_char); + + // Copy literal/numeric value + if (not is_string_value) { + while (in_begin != in_end) { + *out_it++ = *in_begin++; + } + return {in_begin, out_it, NOT_NULL_FLAG, NO_ERROR_FLAG}; + } + + // Escape-flag, set after encountering a backslash character + bool escape = false; + + // Exclude beginning and ending quote chars from string range + if (!options.keepquotes) { + ++in_begin; + --in_end; + } + + // Iterate over the input + while (in_begin != in_end) { + // Copy single character to output + if (!escape) { + escape = (*in_begin == backslash_char); + if (!escape) { *out_it++ = *in_begin; } + in_begin++; + continue; + } + + // Previous char indicated beginning of escape sequence + // Reset escape flag for next loop iteration + escape = false; + + // Check the character that is supposed to be escaped + auto escaped_char = get_escape_char(*in_begin); + + // We escaped an invalid escape character -> "fail"/null for this item + if (escaped_char == NON_ESCAPE_CHAR) { return {in_begin, out_it, NULL_FLAG, INVALID_FLAG}; } + + // Regular, single-character escape + if (escaped_char != UNICODE_SEQ) { + *out_it++ = escaped_char; + ++in_begin; + continue; + } + + // This is an escape sequence of a unicode code point: \uXXXX, + // where each X in XXXX represents a hex digit + // Skip over the 'u' char from \uXXXX to the first hex digit + ++in_begin; + + // Make sure that there's at least 4 characters left from the + // input, which are expected to be hex digits + if (thrust::distance(in_begin, in_end) < UNICODE_HEX_DIGIT_COUNT) { + return {in_begin, out_it, NULL_FLAG, INVALID_FLAG}; + } + + auto hex_val = parse_unicode_hex(in_begin); + + // Couldn't parse hex values from the 
four-character sequence -> "fail"/null for this item + if (hex_val < 0) { return {in_begin, out_it, NULL_FLAG, INVALID_FLAG}; } + + // Skip over the four hex digits + thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); + + // If this may be a UTF-16 encoded surrogate pair: + // we expect another \uXXXX sequence + int32_t hex_low_val = 0; + if (thrust::distance(in_begin, in_end) >= NUM_UNICODE_ESC_SEQ_CHARS && + *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { + // Skip over '\' and 'u' chars + thrust::advance(in_begin, UNICODE_ESC_PREFIX); + + // Try to parse hex value from what may be a UTF16 low surrogate + hex_low_val = parse_unicode_hex(in_begin); + } + + // This is indeed a UTF16 surrogate pair + if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && + hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { + // Skip over the second \uXXXX sequence + thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); + + // Compute UTF16-encoded code point + uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + + (hex_low_val - UTF16_LOW_SURROGATE_BEGIN); + auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); + out_it = write_utf8_char(utf8_chars, out_it); + } + + // Just a single \uXXXX sequence + else { + auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); + out_it = write_utf8_char(utf8_chars, out_it); + } + } + + // The last character of the input is a backslash -> "fail"/null for this item + if (escape) { return {in_begin, out_it, NULL_FLAG, INVALID_FLAG}; } + return {in_begin, out_it, NOT_NULL_FLAG, NO_ERROR_FLAG}; +} + template std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, @@ -86,233 +281,70 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, { if (col_type == cudf::data_type{cudf::type_id::STRING}) { rmm::device_uvector offsets(col_size + 1, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - sizes = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { - sizes[row] = 0; - return; - } - auto const in = str_tuples[row]; - - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); - if (is_null_literal) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. {"a":1.23}) - char const quote_char = '"'; - bool const is_string_value = - in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); - - // Handling non-string values - if (not is_string_value) { - sizes[row] = in.second; - return; - } - - // Strip off quote chars - decltype(in.second) out_size = 0; - - // Escape-flag, set after encountering an escape character - bool escape = false; - - // Exclude beginning and ending quote chars from string range - auto start_index = options.keepquotes ? 0 : 1; - auto end_index = in.second - (options.keepquotes ? 
0 : 1); - for (decltype(in.second) i = start_index; i < end_index; ++i) { - // Previous char was an escape char - if (escape) { - // A unicode code point escape sequence is \uXXXX - auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = 6; - // The escape sequence comprises four hex digits - auto constexpr NUM_UNICODE_ESC_HEX_DIGITS = 4; - // A name for the char following the current one - auto constexpr NEXT_CHAR = 1; - // A name for the char after the next char - auto constexpr NEXT_NEXT_CHAR = 2; - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(in.first[i]); - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - if (escaped_char == UNICODE_SEQ) { - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { - auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); - if (hex_val < 0) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - // Skip over the four hex digits - i += NUM_UNICODE_ESC_HEX_DIGITS; - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - int64_t hex_low_val = 0; - if (i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && in.first[i + NEXT_CHAR] == '\\' && - in.first[i + NEXT_NEXT_CHAR] == 'u') { - hex_low_val = string_to_hex(&in.first[i + 3]); - } - // This is indeed a surrogate pair - if (hex_val >= 0xD800 && hex_low_val >= 0xDC00) { - // Skip over the second \uXXXX sequence - i += NUM_UNICODE_ESC_SEQ_CHARS; - uint32_t unicode_code_point = - 0x10000 + (hex_val - 0xD800) + (hex_low_val - 0xDC00); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - out_size += strings::detail::bytes_in_char_utf8(utf8_chars); - } - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - out_size += strings::detail::bytes_in_char_utf8(utf8_chars); - } - } else { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - } else if (escaped_char == NON_ESCAPE_CHAR) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } else { - out_size++; - } - } else { - escape = in.first[i] == '\\'; - out_size += escape ? 
0 : 1; - } - } - if (escape) { - sizes[row] = 0; - clear_bit(null_mask, row); - return; - } - sizes[row] = out_size; - }); + // Compute string sizes of the post-processed strings + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + sizes = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + // String at current offset is null, e.g., due to omissions + // ([{"b":"foo"},{"a":"foo"}]) + if (not bit_is_set(null_mask, row)) { + sizes[row] = 0; + return; + } + + auto const in_begin = str_tuples[row].first; + auto const in_end = in_begin + str_tuples[row].second; + auto out_it = cub::DiscardOutputIterator<>{}; + auto const str_process_info = + process_string(in_begin, in_end, out_it, options); + + // The total number of characters that we're supposed to copy out + auto const num_chars_copied_out = + thrust::distance(out_it, thrust::get<1>(str_process_info)); + + // Whether to set this row to null (e.g., when the string corresponds to + // the null literal) + auto const set_null = thrust::get<2>(str_process_info); + + // Whether parsing of this value failed due to invalid input + auto const is_invalid = thrust::get<3>(str_process_info); + + if (set_null || is_invalid) { + sizes[row] = 0; + clear_bit(null_mask, row); + return; + } else { + sizes[row] = num_chars_copied_out; + } + }); + + // Compute offsets for the post-processed strings thrust::exclusive_scan( rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); + // Write out post-processed strings (stripping off quotes, replacing escape sequences) rmm::device_uvector chars(offsets.back_element(stream), stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col_size, - [str_tuples, - chars = device_span{chars}, - offsets = device_span{offsets}, - null_mask = static_cast(null_mask.data()), - options] __device__(size_type row) { - if (not bit_is_set(null_mask, row)) { return; } - auto const in = str_tuples[row]; - - // Whether in the original JSON this was a string value enclosed in quotes - // ({"a":"foo"} vs. {"a":1.23}) - char const quote_char = '"'; - bool const is_string_value = - in.second >= 2 && (*in.first == quote_char) && (in.first[in.second - 1] == quote_char); - - // Copy literal/numeric value - if (not is_string_value) { - for (int i = 0, j = 0; i < in.second; ++i) { - chars[offsets[row] + j] = *(in.first + i); - j++; - } - return; - } - - // Escape-flag, set after encountering an escape character - bool escape = false; - - // Exclude beginning and ending quote chars from string range - auto start_index = options.keepquotes ? 0 : 1; - auto end_index = in.second - (options.keepquotes ? 
0 : 1); - - for (int i = start_index, j = 0; i < end_index; ++i) { - // Previous char was escape char - if (escape) { - // A unicode code point escape sequence is \uXXXX - auto constexpr NUM_UNICODE_ESC_SEQ_CHARS = 6; - // The escape sequence comprises four hex digits - auto constexpr NUM_UNICODE_ESC_HEX_DIGITS = 4; - // A name for the char following the current one - auto constexpr NEXT_CHAR = 1; - // A name for the char after the next char - auto constexpr NEXT_NEXT_CHAR = 2; - // Reset escape flag for next loop iteration - escape = false; - - // Check the character that is supposed to be escaped - auto escaped_char = get_escape_char(in.first[i]); - - // This is an escape sequence of a unicode code point: \uXXXX, - // where each X in XXXX represents a hex digit - if (escaped_char == UNICODE_SEQ) { - // Make sure that there's at least 4 characters left from the - // input, which are expected to be hex digits - if (i + NUM_UNICODE_ESC_HEX_DIGITS < end_index) { - auto hex_val = string_to_hex(&in.first[i + NEXT_CHAR]); - if (hex_val < 0) { return; } - // Skip over the four hex digits - i += NUM_UNICODE_ESC_HEX_DIGITS; - - // If this may be a UTF-16 encoded surrogate pair: - // we expect another \uXXXX sequence - int64_t hex_low_val = 0; - if (i + NUM_UNICODE_ESC_SEQ_CHARS < end_index && in.first[i + NEXT_CHAR] == '\\' && - in.first[i + NEXT_NEXT_CHAR] == 'u') { - hex_low_val = string_to_hex(&in.first[i + 3]); - } - // This is indeed a surrogate pair - if (hex_val >= 0xD800 && hex_low_val >= 0xDC00) { - // Skip over the second \uXXXX sequence - i += NUM_UNICODE_ESC_SEQ_CHARS; - uint32_t unicode_code_point = - 0x10000 + ((hex_val - 0xD800) << 10) + (hex_low_val - 0xDC00); - auto utf8_chars = strings::detail::codepoint_to_utf8(unicode_code_point); - j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); - } - // Just a single \uXXXX sequence - else { - auto utf8_chars = strings::detail::codepoint_to_utf8(hex_val); - j += strings::detail::from_char_utf8(utf8_chars, &chars[offsets[row] + j]); - } - } else { - return; - } - } else if (escaped_char == NON_ESCAPE_CHAR) { - return; - } else { - chars[offsets[row] + j] = escaped_char; - j++; - } - } else { - escape = in.first[i] == '\\'; - if (!escape) { - chars[offsets[row] + j] = *(in.first + i); - j++; - } - } - } - if (escape) { return; } - }); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + col_size, + [str_tuples, + chars = device_span{chars}, + offsets = device_span{offsets}, + null_mask = static_cast(null_mask.data()), + options] __device__(size_type row) { + if (not bit_is_set(null_mask, row)) { return; } + + auto const in_begin = str_tuples[row].first; + auto const in_end = in_begin + str_tuples[row].second; + auto out_it = &chars[offsets[row]]; + auto const str_process_info = + process_string(in_begin, in_end, out_it, options); + }); return make_strings_column( col_size, std::move(offsets), std::move(chars), std::move(null_mask)); From 44daa5e0acc682161c9e6ad2825b983b1bc1baba Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 31 Aug 2022 09:36:56 -0700 Subject: [PATCH 103/173] fixes corner case for non-surrogate pair unicode escape --- cpp/src/io/json/data_casting.cuh | 11 ++++++----- cpp/tests/io/json_type_cast_test.cu | 5 +++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index c730bd3aec8..fa3a6a21628 100644 --- a/cpp/src/io/json/data_casting.cuh 
+++ b/cpp/src/io/json/data_casting.cuh @@ -159,7 +159,7 @@ __device__ __forceinline__ thrust::tuple(num_in_chars)}); - if (is_null_literal) { return {in_begin, out_it, NULL_FLAG, NO_ERROR_FLAG}; } + if (is_null_literal) { return {in_begin, out_it, NULL_FLAG, NO_ERROR_FLAG}; } // Whether in the original JSON this was a string value enclosed in quotes // ({"a":"foo"} vs. {"a":1.23}) @@ -238,18 +238,19 @@ __device__ __forceinline__ thrust::tuple= NUM_UNICODE_ESC_SEQ_CHARS && *in_begin == backslash_char && *thrust::next(in_begin) == 'u') { - // Skip over '\' and 'u' chars - thrust::advance(in_begin, UNICODE_ESC_PREFIX); + // Iterator that skips over '\' and 'u' chars (not yet advancing in_begin, as it may turn out + // to not be a surrogate pair) + auto low_surrogate_digit_it = thrust::next(thrust::next(in_begin)); // Try to parse hex value from what may be a UTF16 low surrogate - hex_low_val = parse_unicode_hex(in_begin); + hex_low_val = parse_unicode_hex(low_surrogate_digit_it); } // This is indeed a UTF16 surrogate pair if (hex_val >= UTF16_HIGH_SURROGATE_BEGIN && hex_val < UTF16_HIGH_SURROGATE_END && hex_low_val >= UTF16_LOW_SURROGATE_BEGIN && hex_low_val < UTF16_LOW_SURROGATE_END) { // Skip over the second \uXXXX sequence - thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); + thrust::advance(in_begin, NUM_UNICODE_ESC_SEQ_CHARS); // Compute UTF16-encoded code point uint32_t unicode_code_point = 0x10000 + ((hex_val - UTF16_HIGH_SURROGATE_BEGIN) << 10) + diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index dda39e31795..94d7261b934 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -126,6 +126,7 @@ TEST_F(JSONTypeCastTest, StringEscapes) cudf::test::strings_column_wrapper data({ R"("\uD83D\uDE80")", + R"("\uff21\ud83d\ude80\uff21\uff21")", R"("invalid char being escaped escape char\-")", R"("too few hex digits \u12")", R"("too few hex digits for surrogate pair \uD83D\uDE")", @@ -147,8 +148,8 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto col = cudf::io::json::experimental::parse_data( svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); - auto expected = cudf::test::strings_column_wrapper{{"🚀", "", "", "", "\\", "âž©"}, - {true, false, false, false, true, true}}; + auto expected = cudf::test::strings_column_wrapper{{"🚀", "A🚀AA", "", "", "", "\\", "âž©"}, + {true, true, false, false, false, true, true}}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } From bc273ec3229ebeddfbb2c1df0bcdac325cd7621c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 31 Aug 2022 10:05:50 -0700 Subject: [PATCH 104/173] removes empty src file and improves comments --- cpp/CMakeLists.txt | 1 - cpp/src/io/csv/datetime.cuh | 6 ++++-- cpp/src/io/json/data_casting.cu | 21 --------------------- cpp/src/io/json/data_casting.cuh | 18 +++++++++--------- 4 files changed, 13 insertions(+), 33 deletions(-) delete mode 100644 cpp/src/io/json/data_casting.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 330fd61da40..b6f34db093d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -327,7 +327,6 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp - src/io/json/data_casting.cu src/io/json/json_gpu.cu src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index 28ec3ec8895..f37ecc69eb3 100644 --- 
a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -310,6 +310,7 @@ __inline__ __device__ T parse_optional_integer(char const** begin, char const* e * * @param begin Pointer to the first element of the string * @param end Pointer to the first element after the string + * @return Pointer to the first character excluding any leading spaces */ __inline__ __device__ auto skip_spaces(char const* begin, char const* end) { @@ -319,10 +320,11 @@ __inline__ __device__ auto skip_spaces(char const* begin, char const* end) /** * @brief Excludes the prefix from the input range if the string starts with the prefix. * - * @tparam N length on the prefix, plus one - * @param[in, out] begin Pointer to the first element of the string + * @tparam N length of the prefix, plus one + * @param begin Pointer to the first element of the string * @param end Pointer to the first element after the string * @param prefix String we're searching for at the start of the input range + * @return Pointer to the start of the string excluding the prefix */ template __inline__ __device__ auto skip_if_starts_with(char const* begin, diff --git a/cpp/src/io/json/data_casting.cu b/cpp/src/io/json/data_casting.cu deleted file mode 100644 index 1c2cae2c15b..00000000000 --- a/cpp/src/io/json/data_casting.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "data_casting.cuh" - -namespace cudf::io::json::experimental { - -} // namespace cudf::io::json::experimental diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index fa3a6a21628..a9732bbcb53 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -37,7 +37,7 @@ static constexpr char UNICODE_SEQ = 0x7F; // Invalid escape sequence static constexpr char NON_ESCAPE_CHAR = 0x7E; -// Unicode code point escape sequence prefix comprises '\' and 'u' cahrs +// Unicode code point escape sequence prefix comprises '\' and 'u' characters static constexpr size_type UNICODE_ESC_PREFIX = 2; // Unicode code point escape sequence comprises four hex characters @@ -61,14 +61,14 @@ static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; __device__ __forceinline__ char get_escape_char(char escaped_char) { switch (escaped_char) { - case '"': return 0x22; - case '\\': return 0x5C; - case '/': return 0x2F; - case 'b': return 0x08; - case 'f': return 0x0C; - case 'n': return 0x0A; - case 'r': return 0x0D; - case 't': return 0x09; + case '"': return '"'; + case '\\': return '\\'; + case '/': return '/'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; case 'u': return UNICODE_SEQ; default: return NON_ESCAPE_CHAR; } From 4afbd8b82c02bc4c7c4253245a08322013d3008d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 00:18:42 -0700 Subject: [PATCH 105/173] few style changes --- cpp/src/io/json/data_casting.cuh | 7 +++---- cpp/src/io/utilities/parsing_utils.cuh | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index a9732bbcb53..6325000827d 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -116,8 +116,7 @@ __device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_ { constexpr size_type MAX_UTF8_BYTES_PER_CODE_POINT = 4; char char_bytes[MAX_UTF8_BYTES_PER_CODE_POINT]; - auto const num_chars_written = - strings::detail::from_char_utf8(utf8_chars, reinterpret_cast(char_bytes)); + auto const num_chars_written = strings::detail::from_char_utf8(utf8_chars, char_bytes); for (size_type i = 0; i < MAX_UTF8_BYTES_PER_CODE_POINT; i++) { if (i < num_chars_written) { *out_it++ = char_bytes[i]; } @@ -138,8 +137,8 @@ __device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_ * @param options Settings for controlling string processing behavior * @return A four-tuple of (in_it_end, out_it_end, set_null, is_invalid), where in_it_end is an * iterator to one past the last character from the input that was processed, out_it_end is an - * iterator to one past the last character that was written, set_null being true if a null literal - * was read or a parsing error occured, and is_invalid being true if a parsing error was + * iterator to one past the last character that was written, set_null is true if a null literal + * was read or a parsing error occured, and is_invalid is true if a parsing error was * encountered */ template diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 1da90636465..828bd1fe9da 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -498,9 +498,9 @@ __inline__ __device__ std::pair trim_whitespaces_quote auto const trim_begin = 
thrust::find_if(thrust::seq, begin, end, not_whitespace); auto const trim_end = thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(trim_begin), - not_whitespace); + thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(trim_begin), + not_whitespace); return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()}; } From 8b600aac7ceb721ee1b5f0c53904c698fdefc587 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 00:20:16 -0700 Subject: [PATCH 106/173] avoids superfluous decode definitions --- cpp/src/io/utilities/parsing_utils.cuh | 67 +++----------------------- 1 file changed, 7 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 828bd1fe9da..b2f80f1a224 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -555,62 +555,6 @@ __inline__ __device__ T decode_value(char const* begin, char const* end, parse_o return to_duration(begin, end); } -// The purpose of these is merely to allow compilation ONLY -template <> -__inline__ __device__ cudf::string_view decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::string_view{}; -} - -template <> -__inline__ __device__ cudf::dictionary32 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::dictionary32{}; -} - -template <> -__inline__ __device__ cudf::list_view decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::list_view{}; -} -template <> -__inline__ __device__ cudf::struct_view decode_value(const char*, - const char*, - parse_options_view const&) -{ - return cudf::struct_view{}; -} - -template <> -__inline__ __device__ numeric::decimal32 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return numeric::decimal32{}; -} - -template <> -__inline__ __device__ numeric::decimal64 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return numeric::decimal64{}; -} - -template <> -__inline__ __device__ numeric::decimal128 decode_value(const char*, - const char*, - parse_options_view const&) -{ - return numeric::decimal128{}; -} - struct ConvertFunctor { /** * @brief Dispatch for numeric types whose values can be convertible to @@ -707,7 +651,7 @@ struct ConvertFunctor { } /** - * @brief Dispatch for all other types. + * @brief Dispatch for remaining supported types, i.e., timestamp and duration types. 
*/ template and !std::is_floating_point_v and @@ -720,9 +664,12 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - static_cast(out_buffer)[row] = decode_value(begin, end, opts); - - return true; + if constexpr (cudf::is_timestamp() or cudf::is_duration()) { + static_cast(out_buffer)[row] = decode_value(begin, end, opts); + return true; + } else { + return false; + } } }; From 695aefc5d9896754f1ff0c693dd93c9a69913592 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 00:24:29 -0700 Subject: [PATCH 107/173] using CUDF_ENABLE_IF macro --- cpp/src/io/utilities/parsing_utils.cuh | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index b2f80f1a224..572ca2a8642 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -109,7 +109,7 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template >* = nullptr> +template )> constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -130,7 +130,7 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) * * @return uint8_t Numeric value of the character, or `0` */ -template >* = nullptr> +template )> constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -532,8 +532,7 @@ __inline__ __device__ T decode_value(const char* begin, * * @return The parsed numeric value */ -template () and !cudf::is_duration()>* = nullptr> +template () and !cudf::is_duration())> __inline__ __device__ T decode_value(const char* begin, const char* end, parse_options_view const& opts) @@ -541,7 +540,7 @@ __inline__ __device__ T decode_value(const char* begin, return cudf::io::parse_numeric(begin, end, opts); } -template ()>* = nullptr> +template ())> __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const& opts) @@ -549,7 +548,7 @@ __inline__ __device__ T decode_value(char const* begin, return to_timestamp(begin, end, opts.dayfirst); } -template ()>* = nullptr> +template ())> __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) { return to_duration(begin, end); @@ -590,7 +589,7 @@ struct ConvertFunctor { * * @return bool Whether the parsed value is valid. */ - template ()>* = nullptr> + template ())> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* out_buffer, @@ -611,7 +610,7 @@ struct ConvertFunctor { /** * @brief Dispatch for boolean type types. */ - template >* = nullptr> + template )> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* out_buffer, @@ -635,7 +634,7 @@ struct ConvertFunctor { * @brief Dispatch for floating points, which are set to NaN if the input * is not valid. In such case, the validity mask is set to zero too. 
*/ - template >* = nullptr> + template )> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* out_buffer, From f625a7b3f97e3d46b9671432e85fe2aa656b049b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 00:30:32 -0700 Subject: [PATCH 108/173] fixes a few remaining west consts --- cpp/src/io/utilities/parsing_utils.cuh | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 572ca2a8642..2b6d6e815c7 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -176,8 +176,8 @@ constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -constexpr T parse_numeric(const char* begin, - const char* end, +constexpr T parse_numeric(char const* begin, + char const* end, parse_options_view const& opts, T error_result = std::numeric_limits::quiet_NaN()) { @@ -226,7 +226,7 @@ constexpr T parse_numeric(const char* begin, // Handle exponential part of the number if necessary if (begin < end) { - const int32_t exponent_sign = *begin == '-' ? -1 : 1; + int32_t const exponent_sign = *begin == '-' ? -1 : 1; if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; while (begin < end) { @@ -311,7 +311,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, * than or equal to golden data */ template -__device__ __inline__ bool less_equal_than(const char* data, const char (&golden)[N]) +__device__ __inline__ bool less_equal_than(char const* data, char const (&golden)[N]) { auto mismatch_pair = thrust::mismatch(thrust::seq, data, data + N - 1, golden); if (mismatch_pair.first != data + N - 1) { @@ -427,7 +427,7 @@ cudf::size_type find_all_from_set(device_span data, */ template cudf::size_type find_all_from_set(host_span data, - const std::vector& keys, + std::vector const& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream); @@ -461,7 +461,7 @@ cudf::size_type count_all_from_set(device_span data, * @return cudf::size_type total number of occurrences */ cudf::size_type count_all_from_set(host_span data, - const std::vector& keys, + std::vector const& keys, rmm::cuda_stream_view stream); /** @@ -516,8 +516,8 @@ __inline__ __device__ std::pair trim_whitespaces_quote * @return The parsed numeric value */ template -__inline__ __device__ T decode_value(const char* begin, - const char* end, +__inline__ __device__ T decode_value(char const* begin, + char const* end, parse_options_view const& opts) { return cudf::io::parse_numeric(begin, end, opts); @@ -533,8 +533,8 @@ __inline__ __device__ T decode_value(const char* begin, * @return The parsed numeric value */ template () and !cudf::is_duration())> -__inline__ __device__ T decode_value(const char* begin, - const char* end, +__inline__ __device__ T decode_value(char const* begin, + char const* end, parse_options_view const& opts) { return cudf::io::parse_numeric(begin, end, opts); @@ -569,7 +569,7 @@ struct ConvertFunctor { char const* end, void* out_buffer, size_t row, - const data_type output_type, + data_type const output_type, parse_options_view const& opts, bool as_hex = false) { @@ -594,7 +594,7 @@ struct ConvertFunctor { char const* end, void* out_buffer, size_t row, - const data_type output_type, + data_type const output_type, parse_options_view const& opts, bool as_hex) { @@ -615,7 +615,7 @@ 
struct ConvertFunctor { char const* end, void* out_buffer, size_t row, - const data_type output_type, + data_type const output_type, parse_options_view const& opts, bool as_hex) { @@ -639,7 +639,7 @@ struct ConvertFunctor { char const* end, void* out_buffer, size_t row, - const data_type output_type, + data_type const output_type, parse_options_view const& opts, bool as_hex) { @@ -659,7 +659,7 @@ struct ConvertFunctor { char const* end, void* out_buffer, size_t row, - const data_type output_type, + data_type const output_type, parse_options_view const& opts, bool as_hex) { From ce5ab554594d03e2b4dc2ecb2e9d71d9d57ddb1b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 00:47:39 -0700 Subject: [PATCH 109/173] adds test coverage for remaining esc sequences --- cpp/tests/io/json_type_cast_test.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 94d7261b934..fb6851c14e7 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -132,6 +132,7 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("too few hex digits for surrogate pair \uD83D\uDE")", R"("\u005C")", R"("\u27A9")", + R"("\"\\\/\b\f\n\r\t")", }); auto d_column = cudf::column_device_view::create(data); rmm::device_uvector> svs(d_column->size(), stream); @@ -148,8 +149,9 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto col = cudf::io::json::experimental::parse_data( svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); - auto expected = cudf::test::strings_column_wrapper{{"🚀", "A🚀AA", "", "", "", "\\", "âž©"}, - {true, true, false, false, false, true, true}}; + auto expected = + cudf::test::strings_column_wrapper{{"🚀", "A🚀AA", "", "", "", "\\", "âž©", "\"\\/\b\f\n\r\t"}, + {true, true, false, false, false, true, true, true}}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } From bb82f7e44e2942a7dffb039b0b5fb3146a63cb2d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 02:38:34 -0700 Subject: [PATCH 110/173] adds more comments --- cpp/src/io/json/data_casting.cuh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/src/io/json/data_casting.cuh index 6325000827d..88de7d9641d 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/src/io/json/data_casting.cuh @@ -218,8 +218,8 @@ __device__ __forceinline__ thrust::tuple std::unique_ptr parse_data(str_tuple_it str_tuples, size_type col_size, From b431641942b379a99a39a766f867d05156363aed Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 02:51:08 -0700 Subject: [PATCH 111/173] fixes style --- cpp/src/io/utilities/parsing_utils.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 2b6d6e815c7..5c6c100a35b 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -498,9 +498,9 @@ __inline__ __device__ std::pair trim_whitespaces_quote auto const trim_begin = thrust::find_if(thrust::seq, begin, end, not_whitespace); auto const trim_end = thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(trim_begin), - not_whitespace); + thrust::make_reverse_iterator(end), + 
thrust::make_reverse_iterator(trim_begin), + not_whitespace); return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()}; } From 8d5009903035d3efd41ad8ebb804d019fef9eea5 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 04:29:21 -0700 Subject: [PATCH 112/173] moves data_casting to include --- .../io/json => include/cudf/io/detail}/data_casting.cuh | 4 ++-- cpp/tests/io/json_type_cast_test.cu | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) rename cpp/{src/io/json => include/cudf/io/detail}/data_casting.cuh (99%) diff --git a/cpp/src/io/json/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh similarity index 99% rename from cpp/src/io/json/data_casting.cuh rename to cpp/include/cudf/io/detail/data_casting.cuh index 88de7d9641d..98d5ecd0662 100644 --- a/cpp/src/io/json/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -29,7 +29,7 @@ #include -namespace cudf::io::json::experimental { +namespace cudf::io::json::experimental::detail { // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; @@ -398,4 +398,4 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, return out_col; } -} // namespace cudf::io::json::experimental +} // namespace cudf::io::json::experimental::detail diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index fb6851c14e7..87ba50dcda4 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -24,11 +24,11 @@ #include #include #include +#include #include #include #include #include -#include #include @@ -80,7 +80,7 @@ TEST_F(JSONTypeCastTest, String) auto null_mask = cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); - auto str_col = cudf::io::json::experimental::parse_data( + auto str_col = cudf::io::json::experimental::detail::parse_data( svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); auto out_valids = @@ -110,7 +110,7 @@ TEST_F(JSONTypeCastTest, Int) auto null_mask = cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); - auto col = cudf::io::json::experimental::parse_data( + auto col = cudf::io::json::experimental::detail::parse_data( svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); auto expected = @@ -146,7 +146,7 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto null_mask = cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()); - auto col = cudf::io::json::experimental::parse_data( + auto col = cudf::io::json::experimental::detail::parse_data( svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); auto expected = From 975f30cb08c66ece0f89f4e7482c5e2eaed942fd Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 10:53:54 -0700 Subject: [PATCH 113/173] more cleanups --- cpp/include/cudf/io/detail/data_casting.cuh | 9 +++------ cpp/src/io/utilities/parsing_utils.cuh | 8 ++++---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 98d5ecd0662..f6d9cb18b45 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -237,12 +237,9 @@ __device__ __forceinline__ thrust::tuple= NUM_UNICODE_ESC_SEQ_CHARS && *in_begin == 
backslash_char && *thrust::next(in_begin) == 'u') { - // Iterator that skips over '\' and 'u' chars (not yet advancing in_begin, as it may turn out - // to not be a surrogate pair) - auto low_surrogate_digit_it = thrust::next(thrust::next(in_begin)); - - // Try to parse hex value from what may be a UTF16 low surrogate - hex_low_val = parse_unicode_hex(low_surrogate_digit_it); + // Try to parse hex value following the '\' and 'u' characters from what may be a UTF16 low + // surrogate + hex_low_val = parse_unicode_hex(thrust::next(in_begin, 2)); } // This is indeed a UTF16 surrogate pair diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 5c6c100a35b..a3699acb934 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -563,8 +563,8 @@ struct ConvertFunctor { * @return bool Whether the parsed value is valid. */ template and !std::is_same_v and - !cudf::is_fixed_point()>* = nullptr> + CUDF_ENABLE_IF(std::is_integral_v and !std::is_same_v and + !cudf::is_fixed_point())> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* out_buffer, @@ -653,8 +653,8 @@ struct ConvertFunctor { * @brief Dispatch for remaining supported types, i.e., timestamp and duration types. */ template and !std::is_floating_point_v and - !cudf::is_fixed_point()>* = nullptr> + CUDF_ENABLE_IF(!std::is_integral_v and !std::is_floating_point_v and + !cudf::is_fixed_point())> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* out_buffer, From 70efb99edd14cc512ff0e04f1f7b19c1acada099 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 1 Sep 2022 21:58:37 -0700 Subject: [PATCH 114/173] switches to return struct instead of tuple --- cpp/include/cudf/io/detail/data_casting.cuh | 61 ++++++++++++--------- cpp/tests/io/json_type_cast_test.cu | 7 ++- 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index f6d9cb18b45..3862b9e033d 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -51,6 +51,25 @@ static constexpr auto UTF16_HIGH_SURROGATE_END = 0xDC00; static constexpr auto UTF16_LOW_SURROGATE_BEGIN = 0xDC00; static constexpr auto UTF16_LOW_SURROGATE_END = 0xE000; +/** + * @brief Describing whether data casting of a certain item succeed, the item was parsed to null, or + * whether type casting failed. + */ +enum class data_casting_result { PARSING_SUCCESS, PARSED_TO_NULL, PARSING_FAILURE }; + +/** + * @brief Providing additional information about the type casting result. + */ +template +struct data_casting_result_info { + // One past the last input element that was parsed + in_iterator_t input_parsed_end; + // One past the last output element that was written + out_iterator_t output_processed_end; + // Whether parsing succeeded, item was parsed to null, or failed + data_casting_result result; +}; + /** * @brief Returns the character to output for a given escaped character that's following a * backslash. 
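+ * For example, 'n' maps to the newline character '\n', '"' maps to itself, and 'u' yields the
+ * special UNICODE_SEQ marker that indicates the start of a \uXXXX escape sequence.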
@@ -142,23 +161,18 @@ __device__ __forceinline__ out_it_t write_utf8_char(utf8_char_t utf8_chars, out_ * encountered */ template -__device__ __forceinline__ thrust::tuple process_string( +__device__ __forceinline__ data_casting_result_info process_string( in_iterator_t in_begin, in_iterator_t in_end, out_iterator_t out_it, cudf::io::parse_options_view const& options) { - constexpr bool NULL_FLAG = true; - constexpr bool NOT_NULL_FLAG = false; - constexpr bool INVALID_FLAG = true; - constexpr bool NO_ERROR_FLAG = false; - auto const num_in_chars = thrust::distance(in_begin, in_end); // Check if the value corresponds to the null literal auto const is_null_literal = serialized_trie_contains(options.trie_na, {in_begin, static_cast(num_in_chars)}); - if (is_null_literal) { return {in_begin, out_it, NULL_FLAG, NO_ERROR_FLAG}; } + if (is_null_literal) { return {in_begin, out_it, data_casting_result::PARSED_TO_NULL}; } // Whether in the original JSON this was a string value enclosed in quotes // ({"a":"foo"} vs. {"a":1.23}) @@ -174,7 +188,7 @@ __device__ __forceinline__ thrust::tuple "fail"/null for this item - if (escaped_char == NON_ESCAPE_CHAR) { return {in_begin, out_it, NULL_FLAG, INVALID_FLAG}; } + if (escaped_char == NON_ESCAPE_CHAR) { + return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; + } // Regular, single-character escape if (escaped_char != UNICODE_SEQ) { @@ -221,13 +237,13 @@ __device__ __forceinline__ thrust::tuple "fail"/null for this item - if (hex_val < 0) { return {in_begin, out_it, NULL_FLAG, INVALID_FLAG}; } + if (hex_val < 0) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; } // Skip over the four hex digits thrust::advance(in_begin, UNICODE_HEX_DIGIT_COUNT); @@ -263,8 +279,8 @@ __device__ __forceinline__ thrust::tuple "fail"/null for this item - if (escape) { return {in_begin, out_it, NULL_FLAG, INVALID_FLAG}; } - return {in_begin, out_it, NOT_NULL_FLAG, NO_ERROR_FLAG}; + if (escape) { return {in_begin, out_it, data_casting_result::PARSING_FAILURE}; } + return {in_begin, out_it, data_casting_result::PARSING_SUCCESS}; } /** @@ -314,19 +330,13 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, // The total number of characters that we're supposed to copy out auto const num_chars_copied_out = - thrust::distance(out_it, thrust::get<1>(str_process_info)); - - // Whether to set this row to null (e.g., when the string corresponds to - // the null literal) - auto const set_null = thrust::get<2>(str_process_info); - - // Whether parsing of this value failed due to invalid input - auto const is_invalid = thrust::get<3>(str_process_info); + thrust::distance(out_it, str_process_info.output_processed_end); - if (set_null || is_invalid) { + // If, during parsing, an error occured or we parsed the null literal -> + // set to null + if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { sizes[row] = 0; clear_bit(null_mask, row); - return; } else { sizes[row] = num_chars_copied_out; } @@ -351,8 +361,7 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto const in_begin = str_tuples[row].first; auto const in_end = in_begin + str_tuples[row].second; auto out_it = &chars[offsets[row]]; - auto const str_process_info = - process_string(in_begin, in_end, out_it, options); + process_string(in_begin, in_end, out_it, options); }); return make_strings_column( diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 87ba50dcda4..b5c97a5f6c2 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ 
b/cpp/tests/io/json_type_cast_test.cu @@ -132,6 +132,7 @@ TEST_F(JSONTypeCastTest, StringEscapes) R"("too few hex digits for surrogate pair \uD83D\uDE")", R"("\u005C")", R"("\u27A9")", + R"("escape with nothing to escape \")", R"("\"\\\/\b\f\n\r\t")", }); auto d_column = cudf::column_device_view::create(data); @@ -149,9 +150,9 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto col = cudf::io::json::experimental::detail::parse_data( svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr); - auto expected = - cudf::test::strings_column_wrapper{{"🚀", "A🚀AA", "", "", "", "\\", "âž©", "\"\\/\b\f\n\r\t"}, - {true, true, false, false, false, true, true, true}}; + auto expected = cudf::test::strings_column_wrapper{ + {"🚀", "A🚀AA", "", "", "", "\\", "âž©", "", "\"\\/\b\f\n\r\t"}, + {true, true, false, false, false, true, true, false, true}}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(col->view(), expected); } From fb78f6bdb5fdc7a63f9183da331c3f8fee041ee4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 2 Sep 2022 00:30:37 -0700 Subject: [PATCH 115/173] integrates upstream interface changes --- cpp/src/io/json/nested_json_gpu.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 02d6f7cfc94..0b6c61b8891 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1514,13 +1514,13 @@ std::pair, std::vector> json_column_to inference_options(options).view(), d_input, string_ranges_it, col_size, stream); // Convert strings to the inferred data type - auto col = cudf::io::json::experimental::parse_data(string_spans_it, - col_size, - target_type, - make_validity(json_col).first, - casting_options(options).view(), - stream, - mr); + auto col = cudf::io::json::experimental::detail::parse_data(string_spans_it, + col_size, + target_type, + make_validity(json_col).first, + casting_options(options).view(), + stream, + mr); // Reset nullable if we do not have nulls if (col->null_count() == 0) { col->set_null_mask({}); } From 73b20f549cb3a37116cd6aec2059b298e7b684a4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 2 Sep 2022 03:40:02 -0700 Subject: [PATCH 116/173] minor fixes and clarifications --- cpp/include/cudf/io/json.hpp | 2 +- cpp/src/io/json/nested_json_gpu.cu | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8427b4c5e43..a557b6e0a24 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -209,7 +209,7 @@ class json_reader_options { /** * @brief Whether the experimental reader should keep quotes of string values. 
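+   * If enabled, a value that appears as "foo" in the JSON input keeps its surrounding quote
+   * characters in the resulting strings column instead of being stripped down to foo.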
* - * @returns true the experimental reader should keep quotes, false otherwise + * @returns true if the experimental reader should keep quotes, false otherwise */ bool is_keeping_quotes() const { return _keep_quotes; } diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0b6c61b8891..37313e3f24f 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1441,7 +1441,7 @@ auto casting_options(cudf::io::json_reader_options const& options) { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - auto const stream = rmm::cuda_stream_default; + auto const stream = cudf::default_stream_value; parse_opts.keepquotes = options.is_keeping_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); @@ -1453,7 +1453,7 @@ auto inference_options(cudf::io::json_reader_options const& options) { cudf::io::detail::inference_options parse_opts{}; - auto const stream = rmm::cuda_stream_default; + auto const stream = cudf::default_stream_value; parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); parse_opts.trie_na = cudf::detail::create_serialized_trie({"null"}, stream); @@ -1602,7 +1602,10 @@ table_with_metadata parse_nested_json(host_span input, constexpr uint32_t token_begin_offset_zero = 0; constexpr uint32_t token_end_offset_zero = 0; constexpr uint32_t node_init_child_count_zero = 0; - constexpr bool include_quote_chars = true; + + // Whether the tokenizer stage should keep quote characters for string values + // If the tokenizer keeps the quote characters, they may be stripped during type casting + constexpr bool include_quote_chars = true; // We initialize the very root node and root column, which represent the JSON document being // parsed. That root node is a list node and that root column is a list column. 
The column has the From 01643968ce373f2894375110bda32fd6bc89d130 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 5 Sep 2022 03:52:27 -0700 Subject: [PATCH 117/173] rewrites switch statement in tokenizer stage --- cpp/src/io/json/nested_json_gpu.cu | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 37313e3f24f..0e05f0e61e5 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1104,23 +1104,15 @@ void make_json_column(json_column& root_column, auto get_token_index = [include_quote_char](PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT quote_char_size = 1; - if (include_quote_char) { - switch (token) { - // Include trailing quote char for string values excluded for StringEnd - case token_t::StringEnd: return token_index + quote_char_size; - // Strip off quote char included for FieldNameBegin - case token_t::FieldNameBegin: return token_index + quote_char_size; - default: return token_index; - }; - } else { - switch (token) { - // Strip off quote char included for StringBegin - case token_t::StringBegin: return token_index + quote_char_size; - // Strip off quote char included for FieldNameBegin - case token_t::FieldNameBegin: return token_index + quote_char_size; - default: return token_index; - }; - } + switch (token) { + // Optionally strip off quote char included for StringBegin + case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size); + // Optionally include trailing quote char for string values excluded for StringEnd + case token_t::StringEnd: return token_index + (include_quote_char ? 
quote_char_size : 0); + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; + default: return token_index; + }; }; // The end-of-* partner token for a given beginning-of-* token From 053eca8071e509d56a77924c1eb336378ab18331 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 6 Sep 2022 01:53:29 -0700 Subject: [PATCH 118/173] fixes column order for deeply nested JSON --- cpp/src/io/json/nested_json_gpu.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 0e05f0e61e5..7976ba2f1ac 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1525,9 +1525,10 @@ std::pair, std::vector> json_column_to std::vector column_names{}; size_type num_rows{json_col.current_offset}; // Create children columns - for (auto const& col : json_col.child_columns) { - column_names.emplace_back(col.first); - auto const& child_col = col.second; + for (auto const& col_name : json_col.column_order) { + auto const& col = json_col.child_columns.find(col_name); + column_names.emplace_back(col->first); + auto const& child_col = col->second; auto [child_column, names] = json_column_to_cudf_column(child_col, d_input, options, stream, mr); CUDF_EXPECTS(num_rows == child_column->size(), From b461fd867f9f86c80596f1fa4c1d8b6033054f3e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 11:51:18 -0400 Subject: [PATCH 119/173] Update cpp/src/io/utilities/type_inference.cuh Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/src/io/utilities/type_inference.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index ed7ec8c304c..2b6e941ba36 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -215,8 +215,8 @@ __global__ void detect_column_type_kernel(inference_options_view const options, /** * @brief Constructs column type histogram for a given column string input `data`. * - * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to - * `thrust::tuple` + * @tparam ColumnStringIter Iterator type whose `value_type` is a + * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and `offset_t` needs to be convertible to `std::size_t`. * * @param options View of inference options * @param data JSON string input From c2b03938060a0be67551fcdc4f2bfdfed3662d6c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 11:51:40 -0400 Subject: [PATCH 120/173] Update cpp/src/io/utilities/type_inference.cuh Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/src/io/utilities/type_inference.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 2b6e941ba36..d5229aef713 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -111,8 +111,8 @@ __device__ __inline__ bool is_like_float( /** * @brief Constructs column type histogram for a given column string input `data`. 
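The column-order fix above relies on json_column keeping, next to its name-keyed child_columns map, a column_order list of names recorded in the order they were first seen; iterating that list instead of the map restores the order in which the columns appear in the JSON input. Below is a minimal standalone illustration of the idea using plain standard-library stand-ins (the column_set type and its insert helper are not cudf code):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Stand-in for json_column: children are looked up by name, but the order in
// which they were first seen is tracked separately in column_order.
struct column_set {
  std::map<std::string, int> child_columns;  // sorted by name, not by insertion
  std::vector<std::string> column_order;     // names in first-seen order

  void insert(std::string const& name, int value)
  {
    if (child_columns.emplace(name, value).second) { column_order.push_back(name); }
  }
};

int main()
{
  column_set cols;
  cols.insert("zip", 0);
  cols.insert("city", 1);
  // Iterating child_columns directly would yield "city" before "zip";
  // iterating column_order preserves the order of appearance.
  for (auto const& name : cols.column_order) {
    std::cout << name << " -> " << cols.child_columns.find(name)->second << "\n";
  }
}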
* - * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to - * `thrust::tuple` + * @tparam ColumnStringIter Iterator type whose `value_type` is a + * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and `offset_t` needs to be convertible to `std::size_t`. * * @param[in] options View of inference options * @param[in] data JSON string input From 529d5f2d3547e182f217321c1a1b1c9b4c53d040 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 11:54:54 -0400 Subject: [PATCH 121/173] Cleanups: comment, remove unused header + use const var --- cpp/src/io/utilities/type_inference.cuh | 2 +- cpp/tests/io/type_inference_test.cu | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index ed7ec8c304c..38d36c91b69 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -209,7 +209,7 @@ __global__ void detect_column_type_kernel(inference_options_view const options, colon_count <= 2) { atomicAdd(&column_info->datetime_count, 1); } - } // for + } // grid-stride for loop } /** diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 3afe0556da6..ce6ca3897f0 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -39,8 +38,8 @@ struct TypeInference : public cudf::test::BaseFixture { TEST_F(TypeInference, Basic) { - auto stream = rmm::cuda_stream_default; - auto options = inference_options{}; + auto const stream = rmm::cuda_stream_default; + auto options = inference_options{}; options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); From 9f4907c71c41ed92b6a194a645ab7c3d9a6e64bb Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 12:07:44 -0400 Subject: [PATCH 122/173] Code formatting --- cpp/src/io/utilities/type_inference.cuh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index a97004b3a6e..20fd57a74db 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -111,8 +111,9 @@ __device__ __inline__ bool is_like_float( /** * @brief Constructs column type histogram for a given column string input `data`. * - * @tparam ColumnStringIter Iterator type whose `value_type` is a - * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and `offset_t` needs to be convertible to `std::size_t`. + * @tparam ColumnStringIter Iterator type whose `value_type` is a + * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and + * `offset_t` needs to be convertible to `std::size_t`. * * @param[in] options View of inference options * @param[in] data JSON string input @@ -215,8 +216,9 @@ __global__ void detect_column_type_kernel(inference_options_view const options, /** * @brief Constructs column type histogram for a given column string input `data`. * - * @tparam ColumnStringIter Iterator type whose `value_type` is a - * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and `offset_t` needs to be convertible to `std::size_t`. 
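As the documentation change above spells out, inference never sees individual strings; it sees one flat character buffer plus an iterator of (offset, length) tuples locating each field inside it. A sketch of how such a descriptor iterator is typically assembled, mirroring what the type_inference tests below do (the helper name and the choice of device_uvector are illustrative):

#include <cstddef>
#include <rmm/device_uvector.hpp>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

// d_offsets[i] and d_lengths[i] describe field i inside the flat JSON buffer.
auto make_field_descriptors(rmm::device_uvector<std::size_t> const& d_offsets,
                            rmm::device_uvector<std::size_t> const& d_lengths)
{
  return thrust::make_zip_iterator(
    thrust::make_tuple(d_offsets.begin(), d_lengths.begin()));
}
// For the input [42,52,5], the three fields would be described by
// offsets {1, 4, 7} and lengths {2, 2, 1}.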
+ * @tparam ColumnStringIter Iterator type whose `value_type` is a + * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and + * `offset_t` needs to be convertible to `std::size_t`. * * @param options View of inference options * @param data JSON string input From be98e8475270e7134a451bf7a725de10ca54a555 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 12:14:43 -0400 Subject: [PATCH 123/173] Use fixed-width integers --- cpp/src/io/utilities/type_inference.cuh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 20fd57a74db..d2796c04ec2 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -87,8 +87,11 @@ __device__ __inline__ bool is_digit(char const c, bool const is_hex = false) * False positives are possible because positions are not taken into account. * For example, field "e.123-" would match the pattern. */ -__device__ __inline__ bool is_like_float( - long len, long digit_cnt, long decimal_cnt, long dash_cnt, long exponent_cnt) +__device__ __inline__ bool is_like_float(std::size_t len, + uint32_t digit_cnt, + uint32_t decimal_cnt, + uint32_t dash_cnt, + uint32_t exponent_cnt) { // Can't have more than one exponent and one decimal point if (decimal_cnt > 1) return false; From f864f19271820680bea8d81bd8ec651529cd3fb1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 14:13:54 -0400 Subject: [PATCH 124/173] Add omission null count handling --- cpp/src/io/utilities/type_inference.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index d2796c04ec2..e162e29ec09 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -245,7 +245,7 @@ cudf::io::column_type_histogram detect_column_type(inference_options_view const& d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); detect_column_type_kernel<<>>( - options, data, column_strings_begin, size, d_column_info.data()); + options, data, column_strings_begin, omission_null_count, size, d_column_info.data()); return d_column_info.value(stream); } @@ -262,6 +262,7 @@ cudf::io::column_type_histogram detect_column_type(inference_options_view const& * @param options View of inference options * @param data JSON string input * @param column_strings_begin The begining of an offset-length tuple sequence + * @param omission_null_count Number of omitted nulls * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return The detected data type @@ -270,6 +271,7 @@ template cudf::data_type detect_data_type(inference_options_view const& options, device_span data, ColumnStringIter column_strings_begin, + cudf::size_type omission_null_count, std::size_t const size, rmm::cuda_stream_view stream) { @@ -280,14 +282,15 @@ cudf::data_type detect_data_type(inference_options_view const& options, auto get_type_id = [&](auto const& cinfo) { auto int_count_total = cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; - if (cinfo.null_count == static_cast(size)) { + if ((cinfo.null_count + omission_null_count) == static_cast(size)) { // Entire column is NULL; allocate the smallest amount of memory return type_id::INT8; } else if (cinfo.string_count > 0) { return type_id::STRING; } else if 
(cinfo.datetime_count > 0) { return type_id::TIMESTAMP_MILLISECONDS; - } else if (cinfo.float_count > 0 || (int_count_total > 0 && cinfo.null_count > 0)) { + } else if (cinfo.float_count > 0 || + (int_count_total > 0 && (cinfo.null_count + omission_null_count) > 0)) { return type_id::FLOAT64; } else if (cinfo.big_int_count == 0 && int_count_total != 0) { return type_id::INT64; From ea8493ab4f5abb31eb0d052cf68a42b460421aaf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 14:17:21 -0400 Subject: [PATCH 125/173] Update tests --- cpp/src/io/utilities/type_inference.cuh | 2 +- cpp/tests/io/type_inference_test.cu | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index e162e29ec09..899a51e55d2 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -245,7 +245,7 @@ cudf::io::column_type_histogram detect_column_type(inference_options_view const& d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); detect_column_type_kernel<<>>( - options, data, column_strings_begin, omission_null_count, size, d_column_info.data()); + options, data, column_strings_begin, size, d_column_info.data()); return d_column_info.value(stream); } diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index ce6ca3897f0..a8c97b93f38 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -59,7 +59,9 @@ TEST_F(TypeInference, Basic) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = detect_data_type(options.view(), d_data, d_col_strings, size, stream); + cudf::size_type constexpr num_omitted_nulls = 0; + auto res_type = + detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); } From 5b46d336aceeb7393b081d15ce063b5c5b3d4f9b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 15:58:01 -0400 Subject: [PATCH 126/173] Add null test --- cpp/tests/io/type_inference_test.cu | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index a8c97b93f38..9f6aba44a2c 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -65,3 +65,34 @@ TEST_F(TypeInference, Basic) EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); } + +TEST_F(TypeInference, OmittedNull) +{ + auto const stream = rmm::cuda_stream_default; + auto options = inference_options{}; + + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = "[52,5]"; + rmm::device_uvector d_data{data.size(), stream}; + cudaMemcpyAsync( + d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + cudf::size_type 
constexpr num_omitted_nulls = 1; + auto res_type = + detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + + EXPECT_EQ(res_type, + cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior +} From 0d81a334eb55f9107cac6c10ecbb87f9f02476b3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 6 Sep 2022 18:03:45 -0400 Subject: [PATCH 127/173] Add more tests --- cpp/tests/io/type_inference_test.cu | 90 +++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 9f6aba44a2c..5fa4b359d06 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -96,3 +96,93 @@ TEST_F(TypeInference, OmittedNull) EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior } + +TEST_F(TypeInference, String) +{ + auto const stream = rmm::cuda_stream_default; + auto options = inference_options{}; + + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = "[\"1990\",\"8\",\"25\"]"; + rmm::device_uvector d_data{data.size(), stream}; + cudaMemcpyAsync( + d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + cudf::size_type constexpr num_omitted_nulls = 0; + auto res_type = + detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); +} + +TEST_F(TypeInference, Bool) +{ + auto const stream = rmm::cuda_stream_default; + auto options = inference_options{}; + + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = "[true,false,false]"; + rmm::device_uvector d_data{data.size(), stream}; + cudaMemcpyAsync( + d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + cudf::size_type constexpr num_omitted_nulls = 0; + auto res_type = + detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::BOOL8}); +} + +TEST_F(TypeInference, Timestamp) +{ + auto const stream = rmm::cuda_stream_default; + auto options = inference_options{}; + + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = 
cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = "[1970/2/5,1970/8/25]"; + rmm::device_uvector d_data{data.size(), stream}; + cudaMemcpyAsync( + d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + cudf::size_type constexpr num_omitted_nulls = 0; + auto res_type = + detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); +} From 9c112eefae97eb04d8d070d3e128a2387382b3f5 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 7 Sep 2022 02:37:54 -0700 Subject: [PATCH 128/173] schema meta data for non-string leaf columns --- cpp/src/io/json/nested_json_gpu.cu | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 7976ba2f1ac..41230d65570 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1517,7 +1517,14 @@ std::pair, std::vector> json_column_to // Reset nullable if we do not have nulls if (col->null_count() == 0) { col->set_null_mask({}); } - return {std::move(col), {{"offsets"}, {"chars"}}}; + // For string columns return ["offsets", "char"] schema + if (target_type.id() == type_id::STRING) { + return {std::move(col), {{"offsets"}, {"chars"}}}; + } + // Non-string columns do not have child columns in the schema + else { + return {std::move(col), {}}; + } break; } case json_col_t::StructColumn: { From e3bb216df3cd35d9f224c851ba2e9641d3a53fe7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 7 Sep 2022 09:53:30 -0400 Subject: [PATCH 129/173] Cleanups: renaming, const parameter + namespace --- cpp/src/io/utilities/type_inference.cuh | 60 ++++++++++++------------- cpp/tests/io/type_inference_test.cu | 12 ++--- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 899a51e55d2..7ba15b6c14f 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -31,9 +31,7 @@ #include -namespace cudf { -namespace io { -namespace detail { +namespace cudf::io::detail { /** * @brief Non-owning view for type inference options */ @@ -43,6 +41,7 @@ struct inference_options_view { cudf::detail::trie_view trie_na; char quote_char; }; + /** * @brief Structure for type inference options */ @@ -125,11 +124,11 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[out] column_info Histogram of column type counters */ template -__global__ void detect_column_type_kernel(inference_options_view const options, - device_span const data, - ColumnStringIter column_strings_begin, - std::size_t const size, - cudf::io::column_type_histogram* column_info) +__global__ void infer_column_type_kernel(inference_options_view options, + device_span data, + ColumnStringIter column_strings_begin, + std::size_t size, + cudf::io::column_type_histogram* column_info) { for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { @@ 
-143,8 +142,11 @@ __global__ void detect_column_type_kernel(inference_options_view const options, continue; } + if (field_len == 0) { + atomicAdd(&column_info->null_count, 1); + continue; + } // Handling strings - if (field_len == 0) continue; if (*field_begin == options.quote_char && field_begin[field_len - 1] == options.quote_char) { atomicAdd(&column_info->string_count, 1); continue; @@ -196,7 +198,7 @@ __global__ void detect_column_type_kernel(inference_options_view const options, options.trie_false, {field_begin, static_cast(field_len)})) { atomicAdd(&column_info->bool_count, 1); } else if (digit_count == int_req_number_cnt) { - bool is_negative = (*field_begin == '-'); + auto const is_negative = (*field_begin == '-'); char const* data_begin = field_begin + (is_negative || (*field_begin == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + digit_count, is_negative, *column_info); @@ -231,11 +233,11 @@ __global__ void detect_column_type_kernel(inference_options_view const options, * @return A histogram containing column-specific type counters */ template -cudf::io::column_type_histogram detect_column_type(inference_options_view const& options, - cudf::device_span data, - ColumnStringIter column_strings_begin, - std::size_t const size, - rmm::cuda_stream_view stream) +cudf::io::column_type_histogram infer_column_type(inference_options_view const& options, + cudf::device_span data, + ColumnStringIter column_strings_begin, + std::size_t const size, + rmm::cuda_stream_view stream) { constexpr int block_size = 128; @@ -244,17 +246,17 @@ cudf::io::column_type_histogram detect_column_type(inference_options_view const& CUDF_CUDA_TRY(cudaMemsetAsync( d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); - detect_column_type_kernel<<>>( + infer_column_type_kernel<<>>( options, data, column_strings_begin, size, d_column_info.data()); return d_column_info.value(stream); } /** - * @brief Detects data type for a given JSON string input `data`. + * @brief Infers data type for a given JSON string input `data`. 
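The renamed infer_data_type ultimately reduces to a priority cascade over the counters in column_type_histogram. The outline below restates, as a plain host function, only the branches visible in these patches; histogram_outline is a simplified stand-in, and the unsigned/big-integer branches that are not shown in this excerpt are elided, so treat it as a sketch of the rule order rather than the complete rule set.

#include <cstddef>
#include <cudf/types.hpp>

// Simplified stand-in for cudf::io::column_type_histogram.
struct histogram_outline {
  std::size_t null_count, string_count, datetime_count, float_count, bool_count;
  std::size_t int_count_total;  // big + negative-small + positive-small integer counts
};

cudf::type_id infer_type_outline(histogram_outline const& h, std::size_t num_fields)
{
  if (h.null_count == num_fields) return cudf::type_id::INT8;  // all null: smallest type
  if (h.string_count > 0) return cudf::type_id::STRING;
  if (h.datetime_count > 0) return cudf::type_id::TIMESTAMP_MILLISECONDS;
  // Integers mixed with nulls widen to FLOAT64, matching pandas behavior
  if (h.float_count > 0 || (h.int_count_total > 0 && h.null_count > 0))
    return cudf::type_id::FLOAT64;
  if (h.int_count_total > 0) return cudf::type_id::INT64;  // big-integer cases elided
  if (h.bool_count > 0) return cudf::type_id::BOOL8;
  return cudf::type_id::EMPTY;  // the real function calls CUDF_FAIL here
}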
* * @throw cudf::logic_error if input size is 0 - * @throw cudf::logic_error if data type detection failed + * @throw cudf::logic_error if data type inference failed * * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to * `thrust::tuple` @@ -265,19 +267,19 @@ cudf::io::column_type_histogram detect_column_type(inference_options_view const& * @param omission_null_count Number of omitted nulls * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches - * @return The detected data type + * @return The inferred data type */ template -cudf::data_type detect_data_type(inference_options_view const& options, - device_span data, - ColumnStringIter column_strings_begin, - cudf::size_type omission_null_count, - std::size_t const size, - rmm::cuda_stream_view stream) +cudf::data_type infer_data_type(inference_options_view const& options, + device_span data, + ColumnStringIter column_strings_begin, + cudf::size_type omission_null_count, + std::size_t const size, + rmm::cuda_stream_view stream) { CUDF_EXPECTS(size != 0, "No data available for data type inference.\n"); - auto const h_column_info = detect_column_type(options, data, column_strings_begin, size, stream); + auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = @@ -301,11 +303,9 @@ cudf::data_type detect_data_type(inference_options_view const& options, } else if (cinfo.bool_count > 0) { return type_id::BOOL8; } else { - CUDF_FAIL("Data type detection failed.\n"); + CUDF_FAIL("Data type inference failed.\n"); } }; return cudf::data_type{get_type_id(h_column_info)}; } -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 5fa4b359d06..c220a481688 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -29,7 +29,7 @@ #include #include -using cudf::io::detail::detect_data_type; +using cudf::io::detail::infer_data_type; using cudf::io::detail::inference_options; // Base test fixture for tests @@ -61,7 +61,7 @@ TEST_F(TypeInference, Basic) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); } @@ -91,7 +91,7 @@ TEST_F(TypeInference, OmittedNull) cudf::size_type constexpr num_omitted_nulls = 1; auto res_type = - detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior @@ -122,7 +122,7 @@ TEST_F(TypeInference, String) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); } @@ -152,7 +152,7 @@ TEST_F(TypeInference, Bool) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, 
size, stream); + infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::BOOL8}); } @@ -182,7 +182,7 @@ TEST_F(TypeInference, Timestamp) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - detect_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); } From b71f665c6f033167d039027312ee9a1d4fae1884 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 7 Sep 2022 10:06:18 -0400 Subject: [PATCH 130/173] Code cleanup --- cpp/src/io/utilities/type_inference.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 7ba15b6c14f..e23a57f981c 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -186,12 +186,11 @@ __global__ void infer_column_type_kernel(inference_options_view options, } } - // Integers have to have the length of the string - auto int_req_number_cnt = static_cast(field_len); - // Off by one if they start with a minus sign - if ((*field_begin == '-' || *field_begin == '+') && field_len > 1) { --int_req_number_cnt; } - // Off by one if they are a hexadecimal number - if (maybe_hex) { --int_req_number_cnt; } + // All characters must be digits in an integer, except for the starting sign and 'x' in the + // hexadecimal prefix + auto const int_req_number_cnt = + static_cast(field_len) - + ((*field_begin == '-' || *field_begin == '+') && field_len > 1) - maybe_hex; if (cudf::detail::serialized_trie_contains( options.trie_true, {field_begin, static_cast(field_len)}) || cudf::detail::serialized_trie_contains( From 0a568002382808f3224aa31d03c518eb18e22327 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 7 Sep 2022 10:47:43 -0400 Subject: [PATCH 131/173] Cleanup: use parse_options consistently --- cpp/src/io/utilities/parsing_utils.cuh | 18 +++++++++++++ cpp/src/io/utilities/type_inference.cuh | 34 +++---------------------- cpp/tests/io/type_inference_test.cu | 22 ++++++++-------- 3 files changed, 32 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index a3699acb934..87d83406aef 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -42,6 +42,16 @@ using cudf::device_span; namespace cudf { namespace io { +/** + * @brief Non-owning view for json type inference options + */ +struct json_parse_options_view { + char quote_char; + cudf::detail::trie_view trie_true; + cudf::detail::trie_view trie_false; + cudf::detail::trie_view trie_na; +}; + /** * @brief Structure for holding various options used when parsing and * converting CSV/json data to cuDF data type values. 
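The json_view() accessor added in the next hunk hands the device side a non-owning snapshot of exactly what inference needs (quote character plus the true/false/null tries), while parse_options keeps owning the serialized trie storage. A usage sketch along the lines of the updated tests; the infer_from_buffer wrapper name and the include paths are invented for illustration, and the omitted-null argument reflects the signature at this point in the series (a later patch drops it again):

// Sketch only: include paths are assumed to mirror the (elided) includes in
// cpp/tests/io/type_inference_test.cu.
#include <io/utilities/parsing_utils.cuh>
#include <io/utilities/type_inference.cuh>

#include <rmm/cuda_stream_view.hpp>

#include <cstddef>

template <typename FieldDescriptorIter>
cudf::data_type infer_from_buffer(cudf::device_span<char const> d_json,
                                  FieldDescriptorIter field_descriptors,  // (offset, length) tuples
                                  cudf::size_type num_omitted_nulls,
                                  std::size_t num_fields,
                                  rmm::cuda_stream_view stream)
{
  // parse_options owns the serialized tries (device-side allocations)...
  auto options       = cudf::io::parse_options{',', '\n', '\"'};
  options.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
  options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
  options.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);

  // ...while json_view() passes a cheap, non-owning view of them to the kernel side.
  return cudf::io::detail::infer_data_type(
    options.json_view(), d_json, field_descriptors, num_omitted_nulls, num_fields, stream);
}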
@@ -79,6 +89,14 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; + [[nodiscard]] json_parse_options_view json_view() const + { + return {quotechar, + cudf::detail::make_trie_view(trie_true), + cudf::detail::make_trie_view(trie_false), + cudf::detail::make_trie_view(trie_na)}; + } + [[nodiscard]] parse_options_view view() const { return {delimiter, diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index e23a57f981c..16f0983e565 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -32,34 +32,6 @@ #include namespace cudf::io::detail { -/** - * @brief Non-owning view for type inference options - */ -struct inference_options_view { - cudf::detail::trie_view trie_true; - cudf::detail::trie_view trie_false; - cudf::detail::trie_view trie_na; - char quote_char; -}; - -/** - * @brief Structure for type inference options - */ -struct inference_options { - cudf::detail::optional_trie trie_true; - cudf::detail::optional_trie trie_false; - cudf::detail::optional_trie trie_na; - char quote_char = '"'; - - [[nodiscard]] inference_options_view view() const - { - return {cudf::detail::make_trie_view(trie_true), - cudf::detail::make_trie_view(trie_false), - cudf::detail::make_trie_view(trie_na), - quote_char}; - } -}; - /** * @brief Returns true if the input character is a valid digit. * Supports both decimal and hexadecimal digits (uppercase and lowercase). @@ -124,7 +96,7 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[out] column_info Histogram of column type counters */ template -__global__ void infer_column_type_kernel(inference_options_view options, +__global__ void infer_column_type_kernel(json_parse_options_view options, device_span data, ColumnStringIter column_strings_begin, std::size_t size, @@ -232,7 +204,7 @@ __global__ void infer_column_type_kernel(inference_options_view options, * @return A histogram containing column-specific type counters */ template -cudf::io::column_type_histogram infer_column_type(inference_options_view const& options, +cudf::io::column_type_histogram infer_column_type(json_parse_options_view const& options, cudf::device_span data, ColumnStringIter column_strings_begin, std::size_t const size, @@ -269,7 +241,7 @@ cudf::io::column_type_histogram infer_column_type(inference_options_view const& * @return The inferred data type */ template -cudf::data_type infer_data_type(inference_options_view const& options, +cudf::data_type infer_data_type(json_parse_options_view const& options, device_span data, ColumnStringIter column_strings_begin, cudf::size_type omission_null_count, diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index c220a481688..1a5da84dd2d 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -29,8 +29,8 @@ #include #include +using cudf::io::parse_options; using cudf::io::detail::infer_data_type; -using cudf::io::detail::inference_options; // Base test fixture for tests struct TypeInference : public cudf::test::BaseFixture { @@ -39,8 +39,8 @@ struct TypeInference : public cudf::test::BaseFixture { TEST_F(TypeInference, Basic) { auto const stream = rmm::cuda_stream_default; - auto options = inference_options{}; + auto options = parse_options{',', '\n', '\"'}; options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = 
cudf::detail::create_serialized_trie({"", "null"}, stream); @@ -61,7 +61,7 @@ TEST_F(TypeInference, Basic) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); } @@ -69,8 +69,8 @@ TEST_F(TypeInference, Basic) TEST_F(TypeInference, OmittedNull) { auto const stream = rmm::cuda_stream_default; - auto options = inference_options{}; + auto options = parse_options{',', '\n', '\"'}; options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); @@ -91,7 +91,7 @@ TEST_F(TypeInference, OmittedNull) cudf::size_type constexpr num_omitted_nulls = 1; auto res_type = - infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior @@ -100,8 +100,8 @@ TEST_F(TypeInference, OmittedNull) TEST_F(TypeInference, String) { auto const stream = rmm::cuda_stream_default; - auto options = inference_options{}; + auto options = parse_options{',', '\n', '\"'}; options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); @@ -122,7 +122,7 @@ TEST_F(TypeInference, String) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); } @@ -130,8 +130,8 @@ TEST_F(TypeInference, String) TEST_F(TypeInference, Bool) { auto const stream = rmm::cuda_stream_default; - auto options = inference_options{}; + auto options = parse_options{',', '\n', '\"'}; options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); @@ -152,7 +152,7 @@ TEST_F(TypeInference, Bool) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::BOOL8}); } @@ -160,8 +160,8 @@ TEST_F(TypeInference, Bool) TEST_F(TypeInference, Timestamp) { auto const stream = rmm::cuda_stream_default; - auto options = inference_options{}; + auto options = parse_options{',', '\n', '\"'}; options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); @@ -182,7 +182,7 @@ TEST_F(TypeInference, Timestamp) cudf::size_type constexpr num_omitted_nulls = 0; auto res_type = - infer_data_type(options.view(), d_data, d_col_strings, num_omitted_nulls, size, 
stream); + infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); } From 717eacd01123059b84ddf92e015a1db1e8f710a1 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 7 Sep 2022 10:45:00 -0700 Subject: [PATCH 132/173] resolves upstream interface changes --- cpp/src/io/json/nested_json_gpu.cu | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 41230d65570..fa896a7a958 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1441,17 +1441,6 @@ auto casting_options(cudf::io::json_reader_options const& options) return parse_opts; } -auto inference_options(cudf::io::json_reader_options const& options) -{ - cudf::io::detail::inference_options parse_opts{}; - - auto const stream = cudf::default_stream_value; - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"null"}, stream); - return parse_opts; -} - std::pair, std::vector> json_column_to_cudf_column( json_column const& json_col, device_span d_input, @@ -1502,8 +1491,12 @@ std::pair, std::vector> json_column_to }); // Infer column type - auto target_type = cudf::io::detail::detect_data_type( - inference_options(options).view(), d_input, string_ranges_it, col_size, stream); + auto target_type = cudf::io::detail::infer_data_type(casting_options(options).json_view(), + d_input, + string_ranges_it, + (col_size - json_col.valid_count), + col_size, + stream); // Convert strings to the inferred data type auto col = cudf::io::json::experimental::detail::parse_data(string_spans_it, From 07b2fffffa555ee7944d93322a2c247eef9d5ba4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 09:23:36 -0400 Subject: [PATCH 133/173] Update cpp/tests/io/type_inference_test.cu Co-authored-by: Tobias Ribizel --- cpp/tests/io/type_inference_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 1a5da84dd2d..d2daea8ab21 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -106,7 +106,7 @@ TEST_F(TypeInference, String) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = "[\"1990\",\"8\",\"25\"]"; + std::string data = R"json(["1990","8","25"])json"; rmm::device_uvector d_data{data.size(), stream}; cudaMemcpyAsync( d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); From 5ecff99ff42c5c865e33b1aec962ca8328498b17 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 09:39:17 -0400 Subject: [PATCH 134/173] Revert ommited null changes --- cpp/src/io/utilities/type_inference.cuh | 12 +++--------- cpp/tests/io/type_inference_test.cu | 22 ++++++---------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 16f0983e565..1c5689d30a8 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -114,10 +114,7 @@ __global__ void 
infer_column_type_kernel(json_parse_options_view options, continue; } - if (field_len == 0) { - atomicAdd(&column_info->null_count, 1); - continue; - } + if (field_len == 0) { continue; } // Handling strings if (*field_begin == options.quote_char && field_begin[field_len - 1] == options.quote_char) { atomicAdd(&column_info->string_count, 1); @@ -235,7 +232,6 @@ cudf::io::column_type_histogram infer_column_type(json_parse_options_view const& * @param options View of inference options * @param data JSON string input * @param column_strings_begin The begining of an offset-length tuple sequence - * @param omission_null_count Number of omitted nulls * @param size Size of the string input * @param stream CUDA stream used for device memory operations and kernel launches * @return The inferred data type @@ -244,7 +240,6 @@ template cudf::data_type infer_data_type(json_parse_options_view const& options, device_span data, ColumnStringIter column_strings_begin, - cudf::size_type omission_null_count, std::size_t const size, rmm::cuda_stream_view stream) { @@ -255,15 +250,14 @@ cudf::data_type infer_data_type(json_parse_options_view const& options, auto get_type_id = [&](auto const& cinfo) { auto int_count_total = cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; - if ((cinfo.null_count + omission_null_count) == static_cast(size)) { + if (cinfo.null_count == static_cast(size)) { // Entire column is NULL; allocate the smallest amount of memory return type_id::INT8; } else if (cinfo.string_count > 0) { return type_id::STRING; } else if (cinfo.datetime_count > 0) { return type_id::TIMESTAMP_MILLISECONDS; - } else if (cinfo.float_count > 0 || - (int_count_total > 0 && (cinfo.null_count + omission_null_count) > 0)) { + } else if (cinfo.float_count > 0 || (int_count_total > 0 && cinfo.null_count > 0)) { return type_id::FLOAT64; } else if (cinfo.big_int_count == 0 && int_count_total != 0) { return type_id::INT64; diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 1a5da84dd2d..a44314705cd 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -59,14 +59,12 @@ TEST_F(TypeInference, Basic) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - cudf::size_type constexpr num_omitted_nulls = 0; - auto res_type = - infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); } -TEST_F(TypeInference, OmittedNull) +TEST_F(TypeInference, Null) { auto const stream = rmm::cuda_stream_default; @@ -89,9 +87,7 @@ TEST_F(TypeInference, OmittedNull) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - cudf::size_type constexpr num_omitted_nulls = 1; - auto res_type = - infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior @@ -120,9 +116,7 @@ TEST_F(TypeInference, String) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - cudf::size_type constexpr num_omitted_nulls = 0; - auto res_type = - 
infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); } @@ -150,9 +144,7 @@ TEST_F(TypeInference, Bool) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - cudf::size_type constexpr num_omitted_nulls = 0; - auto res_type = - infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::BOOL8}); } @@ -180,9 +172,7 @@ TEST_F(TypeInference, Timestamp) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - cudf::size_type constexpr num_omitted_nulls = 0; - auto res_type = - infer_data_type(options.json_view(), d_data, d_col_strings, num_omitted_nulls, size, stream); + auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); } From ee588dd8f5b6b3e5251a23d5e5abc79cf0c01694 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 09:43:46 -0400 Subject: [PATCH 135/173] Minor cleanups --- cpp/tests/io/type_inference_test.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 9cca9bbd0c9..da336c2d375 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -45,7 +45,7 @@ TEST_F(TypeInference, Basic) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = "[42,52,5]"; + std::string data = R"json([42,52,5])json"; rmm::device_uvector d_data{data.size(), stream}; cudaMemcpyAsync( d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); @@ -73,7 +73,7 @@ TEST_F(TypeInference, Null) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = "[52,5]"; + std::string data = R"json([52,5])json"; rmm::device_uvector d_data{data.size(), stream}; cudaMemcpyAsync( d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); @@ -130,7 +130,7 @@ TEST_F(TypeInference, Bool) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = "[true,false,false]"; + std::string data = R"json([true,false,false])json"; rmm::device_uvector d_data{data.size(), stream}; cudaMemcpyAsync( d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); @@ -158,7 +158,7 @@ TEST_F(TypeInference, Timestamp) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = "[1970/2/5,1970/8/25]"; + std::string data = R"json([1970/2/5,1970/8/25])json"; rmm::device_uvector d_data{data.size(), stream}; cudaMemcpyAsync( d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); From 
be717a7f8dc22eed526855540944025bfe6ade08 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 10:00:32 -0400 Subject: [PATCH 136/173] Add all null test --- cpp/src/io/utilities/type_inference.cuh | 1 - cpp/tests/io/type_inference_test.cu | 28 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 1c5689d30a8..b6109d4d806 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -114,7 +114,6 @@ __global__ void infer_column_type_kernel(json_parse_options_view options, continue; } - if (field_len == 0) { continue; } // Handling strings if (*field_begin == options.quote_char && field_begin[field_len - 1] == options.quote_char) { atomicAdd(&column_info->string_count, 1); diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index da336c2d375..7a29985ddff 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -93,6 +93,34 @@ TEST_F(TypeInference, Null) cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior } +TEST_F(TypeInference, AllNull) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([null])json"; + rmm::device_uvector d_data{data.size(), stream}; + cudaMemcpyAsync( + d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT8}); // INT8 if all nulls +} + TEST_F(TypeInference, String) { auto const stream = rmm::cuda_stream_default; From 168efddcae6b6a3f49ee7a900d2e00c4a34e2936 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 10:09:38 -0400 Subject: [PATCH 137/173] Renaming json inference options view struct --- cpp/src/io/utilities/parsing_utils.cuh | 4 ++-- cpp/src/io/utilities/type_inference.cuh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 87d83406aef..118fde6fdb6 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -45,7 +45,7 @@ namespace io { /** * @brief Non-owning view for json type inference options */ -struct json_parse_options_view { +struct json_inference_options_view { char quote_char; cudf::detail::trie_view trie_true; cudf::detail::trie_view trie_false; @@ -89,7 +89,7 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; - [[nodiscard]] json_parse_options_view json_view() const + [[nodiscard]] json_inference_options_view json_view() const { return {quotechar, cudf::detail::make_trie_view(trie_true), diff --git 
a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index b6109d4d806..077737960cd 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -96,7 +96,7 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[out] column_info Histogram of column type counters */ template -__global__ void infer_column_type_kernel(json_parse_options_view options, +__global__ void infer_column_type_kernel(json_inference_options_view options, device_span data, ColumnStringIter column_strings_begin, std::size_t size, @@ -200,7 +200,7 @@ __global__ void infer_column_type_kernel(json_parse_options_view options, * @return A histogram containing column-specific type counters */ template -cudf::io::column_type_histogram infer_column_type(json_parse_options_view const& options, +cudf::io::column_type_histogram infer_column_type(json_inference_options_view const& options, cudf::device_span data, ColumnStringIter column_strings_begin, std::size_t const size, @@ -236,7 +236,7 @@ cudf::io::column_type_histogram infer_column_type(json_parse_options_view const& * @return The inferred data type */ template -cudf::data_type infer_data_type(json_parse_options_view const& options, +cudf::data_type infer_data_type(json_inference_options_view const& options, device_span data, ColumnStringIter column_strings_begin, std::size_t const size, From eb930a25d9a4be6127877d4faacf0a223e29773d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 10:42:28 -0400 Subject: [PATCH 138/173] Minor improvement: use block reduce to minimize global atomic --- cpp/src/io/utilities/type_inference.cuh | 38 ++++++++++++++++++++----- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 077737960cd..2bab166c9f5 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -29,6 +29,8 @@ #include #include +#include + #include namespace cudf::io::detail { @@ -85,6 +87,7 @@ __device__ __inline__ bool is_like_float(std::size_t len, /** * @brief Constructs column type histogram for a given column string input `data`. * + * @tparam BlockSize Number of threads in each block * @tparam ColumnStringIter Iterator type whose `value_type` is a * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and * `offset_t` needs to be convertible to `std::size_t`. 
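The BlockSize template parameter introduced above exists so the kernel can combine its per-thread counters with cub::BlockReduce and issue a single atomicAdd per block instead of one per field, as the next hunk shows. The pattern in isolation, reduced to a single counter (a standalone sketch, not the patch code):

#include <cub/block/block_reduce.cuh>

#include <cstddef>

template <int BlockSize>
__global__ void count_matches(char const* data,
                              std::size_t size,
                              char needle,
                              unsigned long long* global_count)
{
  // 1) Each thread accumulates privately over a grid-stride loop.
  unsigned long long local = 0;
  for (auto i = threadIdx.x + static_cast<std::size_t>(blockIdx.x) * BlockSize; i < size;
       i += static_cast<std::size_t>(gridDim.x) * BlockSize) {
    local += (data[i] == needle);
  }

  // 2) One block-wide reduction in shared memory...
  using BlockReduce = cub::BlockReduce<unsigned long long, BlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  auto const block_total = BlockReduce(temp_storage).Sum(local);

  // 3) ...and a single global atomic per block instead of one per element.
  if (threadIdx.x == 0) { atomicAdd(global_count, block_total); }
}

// Example launch: count_matches<128><<<num_blocks, 128, 0, stream>>>(d_chars, n, ',', d_total);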
@@ -95,13 +98,19 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[in] size Size of the string input * @param[out] column_info Histogram of column type counters */ -template +template __global__ void infer_column_type_kernel(json_inference_options_view options, device_span data, ColumnStringIter column_strings_begin, std::size_t size, cudf::io::column_type_histogram* column_info) { + cudf::size_type null_count = 0; + cudf::size_type string_count = 0; + cudf::size_type bool_count = 0; + cudf::size_type float_count = 0; + cudf::size_type datetime_count = 0; + for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { auto const field_offset = thrust::get<0>(*(column_strings_begin + idx)); @@ -110,13 +119,13 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, if (cudf::detail::serialized_trie_contains( options.trie_na, {field_begin, static_cast(field_len)})) { - atomicAdd(&column_info->null_count, 1); + ++null_count; continue; } // Handling strings if (*field_begin == options.quote_char && field_begin[field_len - 1] == options.quote_char) { - atomicAdd(&column_info->string_count, 1); + ++string_count; continue; } @@ -163,7 +172,7 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, options.trie_true, {field_begin, static_cast(field_len)}) || cudf::detail::serialized_trie_contains( options.trie_false, {field_begin, static_cast(field_len)})) { - atomicAdd(&column_info->bool_count, 1); + ++bool_count; } else if (digit_count == int_req_number_cnt) { auto const is_negative = (*field_begin == '-'); char const* data_begin = field_begin + (is_negative || (*field_begin == '+')); @@ -172,7 +181,7 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, atomicAdd(ptr, 1); } else if (is_like_float( field_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - atomicAdd(&column_info->float_count, 1); + ++float_count; } // A date field can have either one or two '-' or '\'; A legal combination will only have one // of them To simplify the process of auto column detection, we are not covering all the @@ -180,9 +189,24 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, else if (((dash_count > 0 && dash_count <= 2 && slash_count == 0) || (dash_count == 0 && slash_count > 0 && slash_count <= 2)) && colon_count <= 2) { - atomicAdd(&column_info->datetime_count, 1); + ++datetime_count; } } // grid-stride for loop + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + auto block_null_count = BlockReduce(temp_storage).Sum(null_count); + auto block_string_count = BlockReduce(temp_storage).Sum(string_count); + auto block_bool_count = BlockReduce(temp_storage).Sum(bool_count); + auto block_float_count = BlockReduce(temp_storage).Sum(float_count); + auto block_datetime_count = BlockReduce(temp_storage).Sum(datetime_count); + if (threadIdx.x == 0) { + atomicAdd(&column_info->null_count, block_null_count); + atomicAdd(&column_info->string_count, block_string_count); + atomicAdd(&column_info->bool_count, block_bool_count); + atomicAdd(&column_info->float_count, block_float_count); + atomicAdd(&column_info->datetime_count, block_datetime_count); + } } /** @@ -213,7 +237,7 @@ cudf::io::column_type_histogram infer_column_type(json_inference_options_view co CUDF_CUDA_TRY(cudaMemsetAsync( d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); - 
infer_column_type_kernel<<>>( + infer_column_type_kernel<<>>( options, data, column_strings_begin, size, d_column_info.data()); return d_column_info.value(stream); From 6e5062f24591870925032a9b14e9a678e6ceb9a5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 10:47:57 -0400 Subject: [PATCH 139/173] Minor cleanup --- cpp/src/io/utilities/type_inference.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 2bab166c9f5..ffc7789db3d 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -290,9 +290,8 @@ cudf::data_type infer_data_type(json_inference_options_view const& options, return type_id::UINT64; } else if (cinfo.bool_count > 0) { return type_id::BOOL8; - } else { - CUDF_FAIL("Data type inference failed.\n"); } + CUDF_FAIL("Data type inference failed.\n"); }; return cudf::data_type{get_type_id(h_column_info)}; } From 223cd379c2d279109742e69b5633a26af7f0b83c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 11:18:31 -0400 Subject: [PATCH 140/173] Update cpp/src/io/utilities/type_inference.cuh Co-authored-by: Tobias Ribizel --- cpp/src/io/utilities/type_inference.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index ffc7789db3d..78d4ac434e4 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -138,9 +138,9 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, uint32_t exponent_count = 0; uint32_t other_count = 0; - auto const maybe_hex = (field_len > 2 && *field_begin == '0' && *(field_begin + 1) == 'x') || - (field_len > 3 && *field_begin == '-' && *(field_begin + 1) == '0' && - *(field_begin + 2) == 'x'); + auto const maybe_hex = (field_len > 2 && field_begin[0] == '0' && field_begin[1] == 'x') || + (field_len > 3 && field_begin[0] == '-' && field_begin[1] == '0' && + field_begin[2] == 'x'); auto const field_end = field_begin + field_len; for (auto pos = field_begin; pos < field_end; ++pos) { From 13fb10d6bb3255a405ed4ddf2c85382aa96cf0b2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 9 Sep 2022 11:42:03 -0400 Subject: [PATCH 141/173] Code formatting --- cpp/src/io/utilities/type_inference.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 78d4ac434e4..ecd04f19e60 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -138,9 +138,9 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, uint32_t exponent_count = 0; uint32_t other_count = 0; - auto const maybe_hex = (field_len > 2 && field_begin[0] == '0' && field_begin[1] == 'x') || - (field_len > 3 && field_begin[0] == '-' && field_begin[1] == '0' && - field_begin[2] == 'x'); + auto const maybe_hex = + (field_len > 2 && field_begin[0] == '0' && field_begin[1] == 'x') || + (field_len > 3 && field_begin[0] == '-' && field_begin[1] == '0' && field_begin[2] == 'x'); auto const field_end = field_begin + field_len; for (auto pos = field_begin; pos < field_end; ++pos) { From 52e79af1810280a2ff90a80d66d058f402d8f1ea Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 12 Sep 2022 06:31:02 -0700 Subject: [PATCH 142/173] omissions in inference as 
zero-length strings --- cpp/src/io/json/nested_json_gpu.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index fa896a7a958..86e475b2cc6 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1491,12 +1491,8 @@ std::pair, std::vector> json_column_to }); // Infer column type - auto target_type = cudf::io::detail::infer_data_type(casting_options(options).json_view(), - d_input, - string_ranges_it, - (col_size - json_col.valid_count), - col_size, - stream); + auto target_type = cudf::io::detail::infer_data_type( + casting_options(options).json_view(), d_input, string_ranges_it, col_size, stream); // Convert strings to the inferred data type auto col = cudf::io::json::experimental::detail::parse_data(string_spans_it, From ee65f284a37d86f2da7b916331b2853feba3a4e7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 12 Sep 2022 15:34:23 -0400 Subject: [PATCH 143/173] Treat all date-like input as string --- cpp/src/io/utilities/type_inference.cuh | 47 ++++++++++++------------- cpp/tests/io/type_inference_test.cu | 3 +- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index ecd04f19e60..e5ac2efd99b 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -88,6 +88,7 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @brief Constructs column type histogram for a given column string input `data`. * * @tparam BlockSize Number of threads in each block + * @tparam OptionsView Type of inference options view * @tparam ColumnStringIter Iterator type whose `value_type` is a * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and * `offset_t` needs to be convertible to `std::size_t`. 
@@ -98,18 +99,17 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[in] size Size of the string input * @param[out] column_info Histogram of column type counters */ -template -__global__ void infer_column_type_kernel(json_inference_options_view options, +template +__global__ void infer_column_type_kernel(OptionsView options, device_span data, ColumnStringIter column_strings_begin, std::size_t size, cudf::io::column_type_histogram* column_info) { - cudf::size_type null_count = 0; - cudf::size_type string_count = 0; - cudf::size_type bool_count = 0; - cudf::size_type float_count = 0; - cudf::size_type datetime_count = 0; + cudf::size_type null_count = 0; + cudf::size_type string_count = 0; + cudf::size_type bool_count = 0; + cudf::size_type float_count = 0; for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { @@ -183,35 +183,30 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, field_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { ++float_count; } - // A date field can have either one or two '-' or '\'; A legal combination will only have one - // of them To simplify the process of auto column detection, we are not covering all the - // date-time formation permutations - else if (((dash_count > 0 && dash_count <= 2 && slash_count == 0) || - (dash_count == 0 && slash_count > 0 && slash_count <= 2)) && - colon_count <= 2) { - ++datetime_count; + // All invalid JSON values are treated as string + else { + ++string_count; } } // grid-stride for loop using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - auto block_null_count = BlockReduce(temp_storage).Sum(null_count); - auto block_string_count = BlockReduce(temp_storage).Sum(string_count); - auto block_bool_count = BlockReduce(temp_storage).Sum(bool_count); - auto block_float_count = BlockReduce(temp_storage).Sum(float_count); - auto block_datetime_count = BlockReduce(temp_storage).Sum(datetime_count); + auto block_null_count = BlockReduce(temp_storage).Sum(null_count); + auto block_string_count = BlockReduce(temp_storage).Sum(string_count); + auto block_bool_count = BlockReduce(temp_storage).Sum(bool_count); + auto block_float_count = BlockReduce(temp_storage).Sum(float_count); if (threadIdx.x == 0) { atomicAdd(&column_info->null_count, block_null_count); atomicAdd(&column_info->string_count, block_string_count); atomicAdd(&column_info->bool_count, block_bool_count); atomicAdd(&column_info->float_count, block_float_count); - atomicAdd(&column_info->datetime_count, block_datetime_count); } } /** * @brief Constructs column type histogram for a given column string input `data`. * + * @tparam OptionsView Type of inference options view * @tparam ColumnStringIter Iterator type whose `value_type` is a * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and * `offset_t` needs to be convertible to `std::size_t`. 
@@ -223,8 +218,8 @@ __global__ void infer_column_type_kernel(json_inference_options_view options, * @param stream CUDA stream used for device memory operations and kernel launches * @return A histogram containing column-specific type counters */ -template -cudf::io::column_type_histogram infer_column_type(json_inference_options_view const& options, +template +cudf::io::column_type_histogram infer_column_type(OptionsView const& options, cudf::device_span data, ColumnStringIter column_strings_begin, std::size_t const size, @@ -247,8 +242,10 @@ cudf::io::column_type_histogram infer_column_type(json_inference_options_view co * @brief Infers data type for a given JSON string input `data`. * * @throw cudf::logic_error if input size is 0 + * @throw cudf::logic_error if date time is not inferred as string * @throw cudf::logic_error if data type inference failed * + * @tparam OptionsView Type of inference options view * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to * `thrust::tuple` * @@ -259,8 +256,8 @@ cudf::io::column_type_histogram infer_column_type(json_inference_options_view co * @param stream CUDA stream used for device memory operations and kernel launches * @return The inferred data type */ -template -cudf::data_type infer_data_type(json_inference_options_view const& options, +template +cudf::data_type infer_data_type(OptionsView const& options, device_span data, ColumnStringIter column_strings_begin, std::size_t const size, @@ -279,7 +276,7 @@ cudf::data_type infer_data_type(json_inference_options_view const& options, } else if (cinfo.string_count > 0) { return type_id::STRING; } else if (cinfo.datetime_count > 0) { - return type_id::TIMESTAMP_MILLISECONDS; + CUDF_FAIL("Date time is inferred as string.\n"); } else if (cinfo.float_count > 0 || (int_count_total > 0 && cinfo.null_count > 0)) { return type_id::FLOAT64; } else if (cinfo.big_int_count == 0 && int_count_total != 0) { diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 7a29985ddff..9b9e10af73d 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -202,5 +202,6 @@ TEST_F(TypeInference, Timestamp) auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); - EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + // All data time (quoted and unquoted) is inferred as string for now + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); } From 0a7bdfa4cd96eb599998635eac0f2cfb0735f054 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 12 Sep 2022 15:59:36 -0400 Subject: [PATCH 144/173] Add invalid input test --- cpp/tests/io/type_inference_test.cu | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 9b9e10af73d..447725eb9b1 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -205,3 +205,32 @@ TEST_F(TypeInference, Timestamp) // All data time (quoted and unquoted) is inferred as string for now EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); } + +TEST_F(TypeInference, InvalidInput) +{ + auto const stream = rmm::cuda_stream_default; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", 
"null"}, stream); + + std::string data = R"json([1,2,3,a,5])json"; + rmm::device_uvector d_data{data.size(), stream}; + cudaMemcpyAsync( + d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + + std::size_t constexpr size = 5; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + + // Invalid input is inferred as string for now + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); +} From 0ba1e260e34b6ec37ecdbff2041c86a9ce62e500 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 01:01:45 -0700 Subject: [PATCH 145/173] adds explicit check for mix of struct and list values in same col --- cpp/src/io/json/nested_json.hpp | 9 ++++++++- cpp/src/io/json/nested_json_gpu.cu | 2 +- cpp/tests/io/nested_json_test.cpp | 25 +++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 673d0370667..5479fbde2e8 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -204,7 +205,7 @@ struct json_column { type = row_type; } // If, at some point within a column, we encounter a nested type (list or struct), - // we change that columns type to that respective nested type and invalidate all previous rows + // we change that column's type to that respective nested type and invalidate all previous rows else if (type == json_col_t::StringColumn && (row_type == json_col_t::ListColumn || row_type == json_col_t::StructColumn)) { // Change the column type @@ -215,6 +216,12 @@ struct json_column { std::fill_n(validity.begin(), validity.size(), 0); valid_count = 0U; } + // If this is a nested column but we're trying to insert either (a) a list node into a struct + // column or (b) a struct node into a list column, we fail + else if ((type == json_col_t::ListColumn && row_type == json_col_t::StructColumn) || + (type == json_col_t::StructColumn && row_type == json_col_t::ListColumn)) { + CUDF_FAIL("A mix of lists and structs within the same column is not supported"); + } // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 86e475b2cc6..97f3b33e242 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1042,7 +1042,7 @@ std::pair, rmm::device_uvector> ge * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory * @param[in] options Parsing options specifying the parsing behaviour - * @param[in] include_quote_char Whether to include the original quote chars for string values, + * @param[in] include_quote_char Whether to include the original quote chars around string values, * allowing to distinguish string values from numeric and literal values * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] mr Optional, resource with 
which to allocate diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 5f311610add..fcc288e60b4 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -424,3 +424,28 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } + +TEST_F(JsonTest, ExpectFailMixStructAndList) +{ + using cuio_json::SymbolT; + + // Prepare cuda stream for data transfers & kernels + constexpr auto stream = cudf::default_stream_value; + + // Default parsing options + cudf::io::json_reader_options options{}; + options.keep_quotes(true); + + std::vector const inputs{ + R"( [{"a":[123], "b":1.0}, {"b":1.1}, {"b":2.1, "a":{"0":123}}] )", + R"( [{"a":{"0":"foo"}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )", + R"( [{"a":{"0":null}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )"}; + + for (auto const& input : inputs) { + // Get the JSON's tree representation + CUDF_EXPECT_THROW_MESSAGE( + auto const cudf_table = cuio_json::detail::parse_nested_json( + cudf::host_span{input.data(), input.size()}, options, stream), + "A mix of lists and structs within the same column is not supported"); + } +} From 47f2c14b8aa153ce937a21b79b3ce6cbd49bd582 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 01:13:14 -0700 Subject: [PATCH 146/173] renames keep_quotes option --- cpp/include/cudf/io/json.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a557b6e0a24..aa7dca0dad3 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -211,7 +211,7 @@ class json_reader_options { * * @returns true if the experimental reader should keep quotes, false otherwise */ - bool is_keeping_quotes() const { return _keep_quotes; } + bool is_enabled_keep_quotes() const { return _keep_quotes; } /** * @brief Set data types for columns to be read. 
@@ -275,7 +275,7 @@ class json_reader_options { * @param val Boolean value to indicate whether the experimental reader should keep quotes * of string values */ - void keep_quotes(bool val) { _keep_quotes = val; } + void enable_keep_quotes(bool val) { _keep_quotes = val; } }; /** From 0c4854797a049fa1db24171cccbe41b7ae5ef775 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 01:13:52 -0700 Subject: [PATCH 147/173] renames helper function for parse options --- cpp/src/io/json/nested_json_gpu.cu | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 97f3b33e242..32bbc5850da 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1429,12 +1429,18 @@ void make_json_column(json_column& root_column, root_column.level_child_cols_recursively(root_column.current_offset); } -auto casting_options(cudf::io::json_reader_options const& options) +/** + * @brief Retrieves the parse_options to be used for type inference and type casting + * + * @param options The reader options to influence the relevant type inference and type casting + * options + */ +auto parsing_options(cudf::io::json_reader_options const& options) { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; auto const stream = cudf::default_stream_value; - parse_opts.keepquotes = options.is_keeping_quotes(); + parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); @@ -1492,14 +1498,14 @@ std::pair, std::vector> json_column_to // Infer column type auto target_type = cudf::io::detail::infer_data_type( - casting_options(options).json_view(), d_input, string_ranges_it, col_size, stream); + parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); // Convert strings to the inferred data type auto col = cudf::io::json::experimental::detail::parse_data(string_spans_it, col_size, target_type, make_validity(json_col).first, - casting_options(options).view(), + parsing_options(options).view(), stream, mr); From 3839329d17924332cc4b75b55013597f2044a628 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 01:21:55 -0700 Subject: [PATCH 148/173] fixes keep_quotes in tests --- cpp/tests/io/nested_json_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index fcc288e60b4..2ad51b9b7df 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -404,7 +404,7 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) // Default parsing options cudf::io::json_reader_options options{}; - options.keep_quotes(true); + options.enable_keep_quotes(true); std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )"; // Get the JSON's tree representation @@ -434,7 +434,7 @@ TEST_F(JsonTest, ExpectFailMixStructAndList) // Default parsing options cudf::io::json_reader_options options{}; - options.keep_quotes(true); + options.enable_keep_quotes(true); std::vector const inputs{ R"( [{"a":[123], "b":1.0}, {"b":1.1}, {"b":2.1, "a":{"0":123}}] )", From fb9ac997aa7548d5da28d0595170c760e9c801da Mon Sep 17 
00:00:00 2001 From: Yunsong Wang Date: Wed, 14 Sep 2022 09:04:57 -0400 Subject: [PATCH 149/173] Reinforce string condition --- cpp/src/io/utilities/type_inference.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index e5ac2efd99b..3a3d04360ef 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -124,7 +124,8 @@ __global__ void infer_column_type_kernel(OptionsView options, } // Handling strings - if (*field_begin == options.quote_char && field_begin[field_len - 1] == options.quote_char) { + if (field_len >= 2 and *field_begin == options.quote_char and + field_begin[field_len - 1] == options.quote_char) { ++string_count; continue; } From c595ac0b57211c0ec8a7ef8ff8f88af8b7e75bb8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 14 Sep 2022 09:41:28 -0400 Subject: [PATCH 150/173] Use per-thread histogram with custom sum reduction --- cpp/src/io/utilities/type_inference.cuh | 60 ++++++++++++++++--------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 3a3d04360ef..63ee731024c 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -34,6 +34,24 @@ #include namespace cudf::io::detail { +/** + * @brief Custom column_type_histogram sum reduction callable + */ +struct custom_sum { + __device__ inline cudf::io::column_type_histogram operator()( + cudf::io::column_type_histogram const& lhs, cudf::io::column_type_histogram const& rhs) + { + return {lhs.null_count + rhs.null_count, + lhs.float_count + rhs.float_count, + lhs.datetime_count + rhs.datetime_count, + lhs.string_count + rhs.string_count, + lhs.negative_small_int_count + rhs.negative_small_int_count, + lhs.positive_small_int_count + rhs.positive_small_int_count, + lhs.big_int_count + rhs.big_int_count, + lhs.bool_count + rhs.bool_count}; + } +}; + /** * @brief Returns true if the input character is a valid digit. * Supports both decimal and hexadecimal digits (uppercase and lowercase). 
@@ -106,10 +124,7 @@ __global__ void infer_column_type_kernel(OptionsView options, std::size_t size, cudf::io::column_type_histogram* column_info) { - cudf::size_type null_count = 0; - cudf::size_type string_count = 0; - cudf::size_type bool_count = 0; - cudf::size_type float_count = 0; + auto thread_type_histogram = cudf::io::column_type_histogram{}; for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size; idx += gridDim.x * blockDim.x) { @@ -119,14 +134,14 @@ __global__ void infer_column_type_kernel(OptionsView options, if (cudf::detail::serialized_trie_contains( options.trie_na, {field_begin, static_cast(field_len)})) { - ++null_count; + ++thread_type_histogram.null_count; continue; } // Handling strings if (field_len >= 2 and *field_begin == options.quote_char and field_begin[field_len - 1] == options.quote_char) { - ++string_count; + ++thread_type_histogram.string_count; continue; } @@ -173,34 +188,39 @@ __global__ void infer_column_type_kernel(OptionsView options, options.trie_true, {field_begin, static_cast(field_len)}) || cudf::detail::serialized_trie_contains( options.trie_false, {field_begin, static_cast(field_len)})) { - ++bool_count; + ++thread_type_histogram.bool_count; } else if (digit_count == int_req_number_cnt) { auto const is_negative = (*field_begin == '-'); char const* data_begin = field_begin + (is_negative || (*field_begin == '+')); cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( - data_begin, data_begin + digit_count, is_negative, *column_info); - atomicAdd(ptr, 1); + data_begin, data_begin + digit_count, is_negative, thread_type_histogram); + ++*ptr; } else if (is_like_float( field_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - ++float_count; + ++thread_type_histogram.float_count; } // All invalid JSON values are treated as string else { - ++string_count; + ++thread_type_histogram.string_count; } } // grid-stride for loop - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - auto block_null_count = BlockReduce(temp_storage).Sum(null_count); - auto block_string_count = BlockReduce(temp_storage).Sum(string_count); - auto block_bool_count = BlockReduce(temp_storage).Sum(bool_count); - auto block_float_count = BlockReduce(temp_storage).Sum(float_count); + auto const block_type_histogram = + BlockReduce(temp_storage).Reduce(thread_type_histogram, custom_sum{}); if (threadIdx.x == 0) { - atomicAdd(&column_info->null_count, block_null_count); - atomicAdd(&column_info->string_count, block_string_count); - atomicAdd(&column_info->bool_count, block_bool_count); - atomicAdd(&column_info->float_count, block_float_count); + atomicAdd(&column_info->null_count, block_type_histogram.null_count); + atomicAdd(&column_info->float_count, block_type_histogram.float_count); + // Date can be ignored for now since all dates are treated as strings + // atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); + atomicAdd(&column_info->string_count, block_type_histogram.string_count); + atomicAdd(&column_info->negative_small_int_count, + block_type_histogram.negative_small_int_count); + atomicAdd(&column_info->positive_small_int_count, + block_type_histogram.positive_small_int_count); + atomicAdd(&column_info->big_int_count, block_type_histogram.big_int_count); + atomicAdd(&column_info->bool_count, block_type_histogram.bool_count); } } From 107c0cc09353d5583a85156418b20527d93bd716 Mon Sep 17 00:00:00 2001 From: Yunsong Wang 
Date: Wed, 14 Sep 2022 10:22:39 -0400 Subject: [PATCH 151/173] Use string scalar instead of char array --- cpp/tests/io/type_inference_test.cu | 99 +++++++++++++++++++---------- 1 file changed, 64 insertions(+), 35 deletions(-) diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index 447725eb9b1..8ba66b6369b 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -45,10 +46,9 @@ TEST_F(TypeInference, Basic) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = R"json([42,52,5])json"; - rmm::device_uvector d_data{data.size(), stream}; - cudaMemcpyAsync( - d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + std::string data = R"json([42,52,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); std::size_t constexpr size = 3; auto const string_offset = std::vector{1, 4, 7}; @@ -59,7 +59,12 @@ TEST_F(TypeInference, Basic) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); } @@ -73,10 +78,9 @@ TEST_F(TypeInference, Null) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = R"json([52,5])json"; - rmm::device_uvector d_data{data.size(), stream}; - cudaMemcpyAsync( - d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + std::string data = R"json([52,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); std::size_t constexpr size = 3; auto const string_offset = std::vector{1, 1, 4}; @@ -87,7 +91,12 @@ TEST_F(TypeInference, Null) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior @@ -102,10 +111,9 @@ TEST_F(TypeInference, AllNull) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = R"json([null])json"; - rmm::device_uvector d_data{data.size(), stream}; - cudaMemcpyAsync( - d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + std::string data = R"json([null])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); std::size_t constexpr size = 3; auto const string_offset = std::vector{1, 1, 1}; @@ -116,7 +124,12 @@ TEST_F(TypeInference, AllNull) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), 
d_string_length.begin())); - auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT8}); // INT8 if all nulls } @@ -130,10 +143,9 @@ TEST_F(TypeInference, String) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = R"json(["1990","8","25"])json"; - rmm::device_uvector d_data{data.size(), stream}; - cudaMemcpyAsync( - d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + std::string data = R"json(["1990","8","25"])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); std::size_t constexpr size = 3; auto const string_offset = std::vector{1, 8, 12}; @@ -144,7 +156,12 @@ TEST_F(TypeInference, String) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); } @@ -158,10 +175,9 @@ TEST_F(TypeInference, Bool) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = R"json([true,false,false])json"; - rmm::device_uvector d_data{data.size(), stream}; - cudaMemcpyAsync( - d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + std::string data = R"json([true,false,false])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); std::size_t constexpr size = 3; auto const string_offset = std::vector{1, 6, 12}; @@ -172,7 +188,12 @@ TEST_F(TypeInference, Bool) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::BOOL8}); } @@ -186,10 +207,9 @@ TEST_F(TypeInference, Timestamp) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = R"json([1970/2/5,1970/8/25])json"; - rmm::device_uvector d_data{data.size(), stream}; - cudaMemcpyAsync( - d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + std::string data = R"json([1970/2/5,1970/8/25])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); std::size_t constexpr size = 3; auto const string_offset = std::vector{1, 10}; @@ -200,7 +220,12 @@ TEST_F(TypeInference, Timestamp) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + 
auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); // All data time (quoted and unquoted) is inferred as string for now EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); @@ -215,10 +240,9 @@ TEST_F(TypeInference, InvalidInput) options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - std::string data = R"json([1,2,3,a,5])json"; - rmm::device_uvector d_data{data.size(), stream}; - cudaMemcpyAsync( - d_data.data(), data.data(), data.size() * sizeof(char), cudaMemcpyHostToDevice, stream.value()); + std::string data = R"json([1,2,3,a,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); std::size_t constexpr size = 5; auto const string_offset = std::vector{1, 3, 5, 7, 9}; @@ -229,7 +253,12 @@ TEST_F(TypeInference, InvalidInput) auto d_col_strings = thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); - auto res_type = infer_data_type(options.json_view(), d_data, d_col_strings, size, stream); + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); // Invalid input is inferred as string for now EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); From 80840b1f0f784136320a067b7bfd15de41f702fa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 14 Sep 2022 10:40:20 -0400 Subject: [PATCH 152/173] Add default member initializer to column_type_histogram --- cpp/src/io/utilities/column_type_histogram.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/utilities/column_type_histogram.hpp b/cpp/src/io/utilities/column_type_histogram.hpp index 99762595693..8bd2d3a89cf 100644 --- a/cpp/src/io/utilities/column_type_histogram.hpp +++ b/cpp/src/io/utilities/column_type_histogram.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,14 +25,14 @@ namespace io { * @brief Per-column histogram struct containing detected occurrences of each dtype */ struct column_type_histogram { - cudf::size_type null_count; - cudf::size_type float_count; - cudf::size_type datetime_count; - cudf::size_type string_count; - cudf::size_type negative_small_int_count; - cudf::size_type positive_small_int_count; - cudf::size_type big_int_count; - cudf::size_type bool_count; + cudf::size_type null_count{}; + cudf::size_type float_count{}; + cudf::size_type datetime_count{}; + cudf::size_type string_count{}; + cudf::size_type negative_small_int_count{}; + cudf::size_type positive_small_int_count{}; + cudf::size_type big_int_count{}; + cudf::size_type bool_count{}; }; } // namespace io From 8deec5444a8df10b7f1d93174fb797d34ead4492 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 14 Sep 2022 10:43:47 -0400 Subject: [PATCH 153/173] Minor updates --- cpp/src/io/utilities/type_inference.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh index 63ee731024c..578c72fc316 100644 --- a/cpp/src/io/utilities/type_inference.cuh +++ b/cpp/src/io/utilities/type_inference.cuh @@ -212,8 +212,7 @@ __global__ void infer_column_type_kernel(OptionsView options, if (threadIdx.x == 0) { atomicAdd(&column_info->null_count, block_type_histogram.null_count); atomicAdd(&column_info->float_count, block_type_histogram.float_count); - // Date can be ignored for now since all dates are treated as strings - // atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); + atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count); atomicAdd(&column_info->string_count, block_type_histogram.string_count); atomicAdd(&column_info->negative_small_int_count, block_type_histogram.negative_small_int_count); From b0be37b481c7f701c93dcf3c7ae2333dc15d68e4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 09:17:27 -0700 Subject: [PATCH 154/173] removes superfluous raw from string --- python/cudf/cudf/tests/test_json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 13908a0ed45..2cb03c3a43c 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -622,8 +622,8 @@ def test_json_nested_lines(data): def test_json_nested_data(): json_str = ( - R'[{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},' - R'{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' + '[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},' + '{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' ) df = cudf.read_json( StringIO(json_str), engine="cudf_experimental", orient="records" From 1185b031325ee819cdd55781d4364f10a5b1d945 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 09:41:48 -0700 Subject: [PATCH 155/173] fixes pytest by accounting for pyarrow misbehaving --- python/cudf/cudf/tests/test_json.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 2cb03c3a43c..4e9fa2f8deb 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -615,9 +615,13 @@ def test_json_nested_lines(data): ) bytes.seek(0) pdf = pd.read_json(bytes, orient="records", lines=True) - # In the second test-case: - # Pandas 
omits "f1" in first row, so we have to enforce a common schema - assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) + # In the second test-case we need to take a detour via pyarrow + # Pandas omits "f1" in first row, so we have to enforce a common schema, + # such that pandas would have the f1 member with null + # Also, pyarrow chooses to select different ordering of a nested column + # children though key-value pairs are correct. + pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + assert df.to_arrow().equals(pa_table_pdf) def test_json_nested_data(): @@ -629,5 +633,6 @@ def test_json_nested_data(): StringIO(json_str), engine="cudf_experimental", orient="records" ) pdf = pd.read_json(StringIO(json_str), orient="records") - - assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) + pdf.columns = pdf.columns.astype('str') + pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + assert df.to_arrow().equals(pa_table_pdf) From 55f9366bb3f752789cd682ce2fd27088fdb38ea5 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 10:03:18 -0700 Subject: [PATCH 156/173] adds pytest for more types --- python/cudf/cudf/tests/test_json.py | 33 +++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 4e9fa2f8deb..cb767029949 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -618,9 +618,11 @@ def test_json_nested_lines(data): # In the second test-case we need to take a detour via pyarrow # Pandas omits "f1" in first row, so we have to enforce a common schema, # such that pandas would have the f1 member with null - # Also, pyarrow chooses to select different ordering of a nested column + # Also, pyarrow chooses to select different ordering of a nested column # children though key-value pairs are correct. 
- pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) assert df.to_arrow().equals(pa_table_pdf) @@ -633,6 +635,29 @@ def test_json_nested_data(): StringIO(json_str), engine="cudf_experimental", orient="records" ) pdf = pd.read_json(StringIO(json_str), orient="records") - pdf.columns = pdf.columns.astype('str') - pa_table_pdf = pa.Table.from_pandas(pdf, schema=df.to_arrow().schema, safe=False) + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +def test_json_types_data(): + # 0:<0:string,1:float> + # 1:list + # 2:<0:bool> + json_str = ( + '[{"0":null,"2":{}},{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' + '{"0":{},"1":[],"2":{"0":null}}]' + ) + df = cudf.read_json( + StringIO(json_str), engine="cudf_experimental", orient="records" + ) + pdf = pd.read_json(StringIO(json_str), orient="records") + pdf.columns = pdf.columns.astype("str") + print(pdf) + print(df) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) assert df.to_arrow().equals(pa_table_pdf) From c0123cf7fee11502d8050c34115d7a0cad570b57 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 14 Sep 2022 10:13:35 -0700 Subject: [PATCH 157/173] style fixes --- cpp/src/io/json/nested_json_gpu.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 32bbc5850da..45f140141d9 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1501,13 +1501,13 @@ std::pair, std::vector> json_column_to parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); // Convert strings to the inferred data type - auto col = cudf::io::json::experimental::detail::parse_data(string_spans_it, - col_size, - target_type, - make_validity(json_col).first, - parsing_options(options).view(), - stream, - mr); + auto col = experimental::detail::parse_data(string_spans_it, + col_size, + target_type, + make_validity(json_col).first, + parsing_options(options).view(), + stream, + mr); // Reset nullable if we do not have nulls if (col->null_count() == 0) { col->set_null_mask({}); } @@ -1516,7 +1516,7 @@ std::pair, std::vector> json_column_to if (target_type.id() == type_id::STRING) { return {std::move(col), {{"offsets"}, {"chars"}}}; } - // Non-string columns do not have child columns in the schema + // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema else { return {std::move(col), {}}; } From e6272118c7e9d241a50b875b5d2581c5040878cc Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 15 Sep 2022 07:10:28 -0700 Subject: [PATCH 158/173] use sparse feature of logical stack --- cpp/src/io/json/nested_json_gpu.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 45f140141d9..301c7da49a2 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -133,9 +133,9 @@ std::array, TT_NUM_STATES> const trans // Translation table (i.e., for each transition, what are the symbols that we output) std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ {/* 
IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, - /* TT_STR */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}, - /* TT_ESC */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}}}; + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}}}, + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -958,10 +958,13 @@ void get_stack_context(device_span json_in, to_stack_op::start_state, stream); + // Copy back to actual number of stack operations + num_stack_ops.device_to_host(stream, true); + // stack operations with indices are converted to top of the stack for each character in the input fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), - device_span{stack_op_indices.data(), stack_op_indices.size()}, + device_span{stack_op_indices.data(), num_stack_ops[0]}, JSONToStackOp{}, d_top_of_stack, root_symbol, From 6cef4eb3ec8a568d88ff7b2493978d4044e91588 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 15 Sep 2022 07:20:16 -0700 Subject: [PATCH 159/173] moves from hostdevice_v to d_scalar --- cpp/src/io/json/nested_json_gpu.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 301c7da49a2..08bfc4825e8 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -33,6 +32,7 @@ #include #include +#include #include #include @@ -931,7 +931,7 @@ void get_stack_context(device_span json_in, constexpr StackSymbolT read_symbol = 'x'; // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) - hostdevice_vector num_stack_ops(single_item, stream); + rmm::device_scalar d_num_stack_ops(stream); // Sequence of stack symbols and their position in the original input (sparse representation) rmm::device_uvector stack_ops{json_in.size(), stream}; @@ -954,17 +954,17 @@ void get_stack_context(device_span json_in, static_cast(json_in.size()), stack_ops.data(), stack_op_indices.data(), - num_stack_ops.device_ptr(), + d_num_stack_ops.data(), to_stack_op::start_state, stream); // Copy back to actual number of stack operations - num_stack_ops.device_to_host(stream, true); + auto const num_stack_ops = d_num_stack_ops.value(stream); // stack operations with indices are converted to top of the stack for each character in the input fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), - device_span{stack_op_indices.data(), num_stack_ops[0]}, + device_span{stack_op_indices.data(), num_stack_ops}, JSONToStackOp{}, d_top_of_stack, root_symbol, From b7e5eb67464faee00ffb5735f76d848e8277fc15 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 15 Sep 2022 08:11:59 -0700 Subject: [PATCH 160/173] adds test for nested column order --- cpp/tests/io/json_test.cpp | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 232aaa51ef3..7f698774084 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1006,4 +1006,67 @@ TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions) } } +TEST_F(JsonReaderTest, TestColumnOrder) +{ + std::string const json_string = + // Expected order: + // root: b, c, a, d + // a: 2, 0, 1 + 
{R"({"b":"b0"} + {"c":"c1","a":{"2":null}} + {"d":"d2","a":{"0":"a2.0", "2":"a2.2"}} + {"b":"b3","a":{"1":null, "2":"a3.2"}})"}; + + std::vector const root_col_names{"b", "c", "a", "d"}; + std::vector const a_child_col_names{"2", "0", "1"}; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true) + .experimental(true); + + // Read in data using nested JSON reader + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify root column order (assert to avoid OOB access) + ASSERT_EQ(new_reader_table.metadata.schema_info.size(), root_col_names.size()); + + for (std::size_t i = 0; i < a_child_col_names.size(); i++) { + auto const& root_col_name = root_col_names[i]; + EXPECT_EQ(new_reader_table.metadata.schema_info[i].name, root_col_name); + } + + // Verify nested child column order (assert to avoid OOB access) + ASSERT_EQ(new_reader_table.metadata.schema_info[2].children.size(), a_child_col_names.size()); + for (std::size_t i = 0; i < a_child_col_names.size(); i++) { + auto const& a_child_col_name = a_child_col_names[i]; + EXPECT_EQ(new_reader_table.metadata.schema_info[2].children[i].name, a_child_col_name); + } + + // Verify data of root columns + ASSERT_EQ(root_col_names.size(), new_reader_table.tbl->num_columns()); + column_wrapper root_col_data_b{{"b0", "", "", "b3"}, + {true, false, false, true}}; + column_wrapper root_col_data_c{{"", "c1", "", ""}, + {false, true, false, false}}; + column_wrapper root_col_data_d{{"", "", "d2", ""}, + {false, false, true, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_b, new_reader_table.tbl->get_column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_c, new_reader_table.tbl->get_column(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_d, new_reader_table.tbl->get_column(3)); + + // Verify data of child columns of column 'a' + auto const col_a = new_reader_table.tbl->get_column(2); + ASSERT_EQ(a_child_col_names.size(), col_a.num_children()); + column_wrapper col_a2{{"", "", "a2.2", "a3.2"}, {false, false, true, true}}; + column_wrapper col_a0{{"", "", "a2.0", ""}, {false, false, true, false}}; + // col a.1 is inferred as all-null + int8_wrapper col_a1{{0, 0, 0, 0}, {false, false, false, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a2, col_a.child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a0, col_a.child(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a1, col_a.child(2)); +} + CUDF_TEST_PROGRAM_MAIN() From 36bf5710dfb212c53a4c5c4ab051c982d1cbd505 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 15 Sep 2022 08:26:34 -0700 Subject: [PATCH 161/173] fixes style --- python/cudf/cudf/tests/test_json.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index cb767029949..23096f04995 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -647,7 +647,8 @@ def test_json_types_data(): # 1:list # 2:<0:bool> json_str = ( - '[{"0":null,"2":{}},{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' + '[{"0":null,"2":{}},' + '{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' '{"0":{},"1":[],"2":{"0":null}}]' ) df = cudf.read_json( From 206ed8d0dc5af9c95caf55bfbe45fa5b6cb25207 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> 
Date: Fri, 16 Sep 2022 00:48:03 -0700 Subject: [PATCH 162/173] removes giving names to trivial values --- cpp/tests/io/nested_json_test.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 2ad51b9b7df..44a04b9a0b0 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -411,17 +411,15 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) auto const cudf_table = cuio_json::detail::parse_nested_json( cudf::host_span{input.data(), input.size()}, options, stream); - auto constexpr expected_col_count = 2; - auto constexpr first_column_index = 0; - auto constexpr second_column_index = 1; + auto constexpr expected_col_count = 2; EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); auto expected_col1 = cudf::test::strings_column_wrapper({R"("0.0")", R"()", R"("2.0")"}, {true, false, true}); auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}); - cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); + cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); - cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); + cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } From 167f215feaeaf18d48b439265d831b5bee9c840d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 16 Sep 2022 00:48:18 -0700 Subject: [PATCH 163/173] more tests on nested column inference --- cpp/tests/io/nested_json_test.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 44a04b9a0b0..73b10aad9f9 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -434,16 +434,25 @@ TEST_F(JsonTest, ExpectFailMixStructAndList) cudf::io::json_reader_options options{}; options.enable_keep_quotes(true); - std::vector const inputs{ + std::vector const inputs_fail{ R"( [{"a":[123], "b":1.0}, {"b":1.1}, {"b":2.1, "a":{"0":123}}] )", R"( [{"a":{"0":"foo"}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )", R"( [{"a":{"0":null}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )"}; - for (auto const& input : inputs) { - // Get the JSON's tree representation + std::vector const inputs_succeed{ + R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}] )", + R"( [{"a":[123, "123"], "b":1.0}, {"b":1.1}, {"b":2.1}] )"}; + + for (auto const& input : inputs_fail) { CUDF_EXPECT_THROW_MESSAGE( auto const cudf_table = cuio_json::detail::parse_nested_json( cudf::host_span{input.data(), input.size()}, options, stream), "A mix of lists and structs within the same column is not supported"); } + + for (auto const& input : inputs_succeed) { + CUDF_EXPECT_NO_THROW( + auto const cudf_table = cuio_json::detail::parse_nested_json( + cudf::host_span{input.data(), input.size()}, options, stream)); + } } From 226f42de8d9728ec1e41eddc9a30bd341c30c425 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 16 Sep 2022 00:56:10 -0700 Subject: [PATCH 164/173] removes unused var --- cpp/src/io/json/nested_json_gpu.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 08bfc4825e8..b96b8b466c0 100644 --- a/cpp/src/io/json/nested_json_gpu.cu 
+++ b/cpp/src/io/json/nested_json_gpu.cu @@ -923,8 +923,6 @@ void get_stack_context(device_span json_in, // -> Logical stack to infer the stack context CUDF_FUNC_RANGE(); - constexpr std::size_t single_item = 1; - // Symbol representing the JSON-root (i.e., we're at nesting level '0') constexpr StackSymbolT root_symbol = '_'; // This can be any stack symbol from the stack alphabet that does not push onto stack From c70831e881531cc9a5b358959850a2ecf019b29b Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 16 Sep 2022 01:05:20 -0700 Subject: [PATCH 165/173] moves json_col member function definitions to source --- cpp/src/io/json/nested_json.hpp | 89 +---------------------------- cpp/src/io/json/nested_json_gpu.cu | 92 ++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 86 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 5479fbde2e8..d469bf8955a 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -142,19 +142,7 @@ struct json_column { * * @param up_to_row_offset The row offset up to which to fill with nulls. */ - void null_fill(row_offset_t up_to_row_offset) - { - // Fill all the rows up to up_to_row_offset with "empty"/null rows - validity.resize(word_index(up_to_row_offset) + 1); - std::fill_n(std::back_inserter(string_offsets), - up_to_row_offset - string_offsets.size(), - (string_offsets.size() > 0) ? string_offsets.back() : 0); - std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0); - std::fill_n(std::back_inserter(child_offsets), - up_to_row_offset + 1 - child_offsets.size(), - (child_offsets.size() > 0) ? child_offsets.back() : 0); - current_offset = up_to_row_offset; - } + void null_fill(row_offset_t up_to_row_offset); /** * @brief Recursively iterates through the tree of columns making sure that all child columns of a @@ -162,26 +150,7 @@ struct json_column { * * @param min_row_count The minimum number of rows to be filled. 
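// ---------------------------------------------------------------------------
// Editorial sketch, not part of the patch: the padding that null_fill()
// performs, restated with plain std::vector bookkeeping so the invariants are
// visible without the cudf types (the real member function also grows the
// validity bitmask, omitted here). toy_column and pad_with_nulls are
// hypothetical names.
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

struct toy_column {
  std::vector<uint32_t> string_offsets;  // one entry per row
  std::vector<uint32_t> string_lengths;  // one entry per row
  std::vector<uint32_t> child_offsets;   // one entry per row plus a trailing end offset
  uint32_t current_offset = 0;           // number of rows materialized so far
};

// Pads `col` so it accounts for `up_to_row_offset` rows: new rows repeat the
// last string offset with length 0 and repeat the last child offset, i.e. they
// are empty and claim no children. This is what keeps sparse records such as
// {"a":1} followed by {"b":2} aligned row-wise across columns.
void pad_with_nulls(toy_column& col, uint32_t up_to_row_offset)
{
  std::fill_n(std::back_inserter(col.string_offsets),
              up_to_row_offset - col.string_offsets.size(),
              col.string_offsets.empty() ? 0u : col.string_offsets.back());
  std::fill_n(std::back_inserter(col.string_lengths),
              up_to_row_offset - col.string_lengths.size(),
              0u);
  std::fill_n(std::back_inserter(col.child_offsets),
              up_to_row_offset + 1 - col.child_offsets.size(),
              col.child_offsets.empty() ? 0u : col.child_offsets.back());
  col.current_offset = up_to_row_offset;
}
// ---------------------------------------------------------------------------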
*/ - void level_child_cols_recursively(row_offset_t min_row_count) - { - // Fill this columns with nulls up to the given row count - null_fill(min_row_count); - - // If this is a struct column, we need to level all its child columns - if (type == json_col_t::StructColumn) { - for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) { - it->second.level_child_cols_recursively(min_row_count); - } - } - // If this is a list column, we need to make sure that its child column levels its children - else if (type == json_col_t::ListColumn) { - auto it = std::begin(child_columns); - // Make that child column fill its child columns up to its own row count - if (it != std::end(child_columns)) { - it->second.level_child_cols_recursively(it->second.current_offset); - } - } - } + void level_child_cols_recursively(row_offset_t min_row_count); /** * @brief Appends the row at the given index to the column, filling all rows between the column's @@ -198,59 +167,7 @@ struct json_column { json_col_t const& row_type, uint32_t string_offset, uint32_t string_end, - uint32_t child_count) - { - // If, thus far, the column's type couldn't be inferred, we infer it to the given type - if (type == json_col_t::Unknown) { - type = row_type; - } - // If, at some point within a column, we encounter a nested type (list or struct), - // we change that column's type to that respective nested type and invalidate all previous rows - else if (type == json_col_t::StringColumn && - (row_type == json_col_t::ListColumn || row_type == json_col_t::StructColumn)) { - // Change the column type - type = row_type; - - // Invalidate all previous entries, as they were _not_ of the nested type to which we just - // converted - std::fill_n(validity.begin(), validity.size(), 0); - valid_count = 0U; - } - // If this is a nested column but we're trying to insert either (a) a list node into a struct - // column or (b) a struct node into a list column, we fail - else if ((type == json_col_t::ListColumn && row_type == json_col_t::StructColumn) || - (type == json_col_t::StructColumn && row_type == json_col_t::ListColumn)) { - CUDF_FAIL("A mix of lists and structs within the same column is not supported"); - } - - // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type - // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); - - // Fill all the omitted rows with "empty"/null rows (if needed) - null_fill(row_index); - - // Table listing what we intend to use for a given column type and row type combination - // col type | row type => {valid, FAIL, null} - // ----------------------------------------------- - // List | List => valid - // List | Struct => FAIL - // List | String => null - // Struct | List => FAIL - // Struct | Struct => valid - // Struct | String => null - // String | List => valid (we switch col type to list, null'ing all previous rows) - // String | Struct => valid (we switch col type to list, null'ing all previous rows) - // String | String => valid - bool const is_valid = (type == row_type); - if (static_cast(validity.size()) < word_index(current_offset)) - validity.push_back({}); - if (is_valid) { set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); } - valid_count += (is_valid) ? 1U : 0U; - string_offsets.push_back(string_offset); - string_lengths.push_back(string_end - string_offset); - child_offsets.push_back((child_offsets.size() > 0) ? 
child_offsets.back() + child_count : 0); - current_offset++; - }; + uint32_t child_count); }; /** diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b96b8b466c0..f41d099e5c2 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -912,6 +912,98 @@ struct JSONToStackOp { } }; +void json_column::null_fill(row_offset_t up_to_row_offset) +{ + // Fill all the rows up to up_to_row_offset with "empty"/null rows + validity.resize(word_index(up_to_row_offset) + 1); + std::fill_n(std::back_inserter(string_offsets), + up_to_row_offset - string_offsets.size(), + (string_offsets.size() > 0) ? string_offsets.back() : 0); + std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0); + std::fill_n(std::back_inserter(child_offsets), + up_to_row_offset + 1 - child_offsets.size(), + (child_offsets.size() > 0) ? child_offsets.back() : 0); + current_offset = up_to_row_offset; +} + +void json_column::level_child_cols_recursively(row_offset_t min_row_count) +{ + // Fill this columns with nulls up to the given row count + null_fill(min_row_count); + + // If this is a struct column, we need to level all its child columns + if (type == json_col_t::StructColumn) { + for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) { + it->second.level_child_cols_recursively(min_row_count); + } + } + // If this is a list column, we need to make sure that its child column levels its children + else if (type == json_col_t::ListColumn) { + auto it = std::begin(child_columns); + // Make that child column fill its child columns up to its own row count + if (it != std::end(child_columns)) { + it->second.level_child_cols_recursively(it->second.current_offset); + } + } +}; + +void json_column::append_row(uint32_t row_index, + json_col_t const& row_type, + uint32_t string_offset, + uint32_t string_end, + uint32_t child_count) +{ + // If, thus far, the column's type couldn't be inferred, we infer it to the given type + if (type == json_col_t::Unknown) { + type = row_type; + } + // If, at some point within a column, we encounter a nested type (list or struct), + // we change that column's type to that respective nested type and invalidate all previous rows + else if (type == json_col_t::StringColumn && + (row_type == json_col_t::ListColumn || row_type == json_col_t::StructColumn)) { + // Change the column type + type = row_type; + + // Invalidate all previous entries, as they were _not_ of the nested type to which we just + // converted + std::fill_n(validity.begin(), validity.size(), 0); + valid_count = 0U; + } + // If this is a nested column but we're trying to insert either (a) a list node into a struct + // column or (b) a struct node into a list column, we fail + else if ((type == json_col_t::ListColumn && row_type == json_col_t::StructColumn) || + (type == json_col_t::StructColumn && row_type == json_col_t::ListColumn)) { + CUDF_FAIL("A mix of lists and structs within the same column is not supported"); + } + + // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type + // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); + + // Fill all the omitted rows with "empty"/null rows (if needed) + null_fill(row_index); + + // Table listing what we intend to use for a given column type and row type combination + // col type | row type => {valid, FAIL, null} + // ----------------------------------------------- + // List | List => valid + // List 
| Struct => FAIL + // List | String => null + // Struct | List => FAIL + // Struct | Struct => valid + // Struct | String => null + // String | List => valid (we switch col type to list, null'ing all previous rows) + // String | Struct => valid (we switch col type to list, null'ing all previous rows) + // String | String => valid + bool const is_valid = (type == row_type); + if (static_cast(validity.size()) < word_index(current_offset)) validity.push_back({}); + if (is_valid) { set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); } + valid_count += (is_valid) ? 1U : 0U; + string_offsets.push_back(string_offset); + string_lengths.push_back(string_end - string_offset); + child_offsets.push_back((child_offsets.size() > 0) ? child_offsets.back() + child_count : 0); + current_offset++; +}; + namespace detail { void get_stack_context(device_span json_in, From 633f57bc308acbc0cc6c3de6ca16225b6d5ccb00 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 16 Sep 2022 10:55:02 -0700 Subject: [PATCH 166/173] pass by values for simple type --- cpp/src/io/json/nested_json.hpp | 2 +- cpp/src/io/json/nested_json_gpu.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index d469bf8955a..8923e72ab16 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -164,7 +164,7 @@ struct json_column { * the offsets */ void append_row(uint32_t row_index, - json_col_t const& row_type, + json_col_t row_type, uint32_t string_offset, uint32_t string_end, uint32_t child_count); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index f41d099e5c2..71e0f854b04 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -948,7 +948,7 @@ void json_column::level_child_cols_recursively(row_offset_t min_row_count) }; void json_column::append_row(uint32_t row_index, - json_col_t const& row_type, + json_col_t row_type, uint32_t string_offset, uint32_t string_end, uint32_t child_count) From c5750ba03f82d3de9b7bbeb7d00761ad288f095d Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:22:13 -0700 Subject: [PATCH 167/173] style fix --- cpp/src/io/json/reader_impl.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index da6e7621449..743984e1c77 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -530,10 +530,10 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, 0, stream); auto repl = make_strings_column(cudf::detail::make_device_uvector_async(repl_chars, stream), - cudf::detail::make_device_uvector_async(repl_offsets, stream), - {}, - 0, - stream); + cudf::detail::make_device_uvector_async(repl_offsets, stream), + {}, + 0, + stream); auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); std::vector> out_columns; From e5050e6bbb4ada54c370484c4c1612ec08a93438 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:28:57 -0700 Subject: [PATCH 168/173] revert style change --- cpp/src/io/json/reader_impl.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 743984e1c77..da6e7621449 100644 --- 
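// ---------------------------------------------------------------------------
// Editorial sketch, not part of the patch: the column-type/row-type table that
// json_column::append_row implements, restated as a small pure function, plus
// the bit bookkeeping behind word_index / intra_word_index / set_bit_unsafe
// (assuming 32-bit bitmask words, as cudf::bitmask_type is). All names below
// are hypothetical.
#include <cstdint>
#include <stdexcept>
#include <utility>
#include <vector>

enum class col_kind { unknown, list_col, struct_col, string_col };

// Returns the (possibly promoted) column type and whether the appended row is
// valid. Throws on a list/struct mix, mirroring the CUDF_EXPECTS in the patch.
std::pair<col_kind, bool> classify_row(col_kind col_type, col_kind row_type)
{
  if (col_type == col_kind::unknown) { return {row_type, true}; }
  // A string column that later sees a nested row is promoted to that nested
  // type; the caller invalidates all earlier rows.
  if (col_type == col_kind::string_col &&
      (row_type == col_kind::list_col || row_type == col_kind::struct_col)) {
    return {row_type, true};
  }
  if ((col_type == col_kind::list_col && row_type == col_kind::struct_col) ||
      (col_type == col_kind::struct_col && row_type == col_kind::list_col)) {
    throw std::logic_error("A mix of lists and structs within the same column is not supported");
  }
  // Remaining cases: a matching type is a valid row; a string row appended to
  // a nested column becomes a null row.
  return {col_type, col_type == row_type};
}

// Appends one validity bit to a bitmask stored as 32-bit words.
void append_validity_bit(std::vector<uint32_t>& validity, uint32_t row, bool is_valid)
{
  auto const word = row / 32u;
  auto const bit  = row % 32u;
  if (validity.size() <= word) { validity.push_back(0u); }  // grow by one word
  if (is_valid) { validity[word] |= (1u << bit); }
}
// ---------------------------------------------------------------------------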
a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -530,10 +530,10 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, 0, stream); auto repl = make_strings_column(cudf::detail::make_device_uvector_async(repl_chars, stream), - cudf::detail::make_device_uvector_async(repl_offsets, stream), - {}, - 0, - stream); + cudf::detail::make_device_uvector_async(repl_offsets, stream), + {}, + 0, + stream); auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); std::vector> out_columns; From 4c1d96cd80ba1135f5c008af9b4ea9595e0cc2ac Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:38:18 -0700 Subject: [PATCH 169/173] removes cudf_fail in favor of cudf_expects --- cpp/src/io/json/nested_json_gpu.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 71e0f854b04..dc33e15fd7d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -971,10 +971,9 @@ void json_column::append_row(uint32_t row_index, } // If this is a nested column but we're trying to insert either (a) a list node into a struct // column or (b) a struct node into a list column, we fail - else if ((type == json_col_t::ListColumn && row_type == json_col_t::StructColumn) || - (type == json_col_t::StructColumn && row_type == json_col_t::ListColumn)) { - CUDF_FAIL("A mix of lists and structs within the same column is not supported"); - } + CUDF_EXPECTS(not((type == json_col_t::ListColumn and row_type == json_col_t::StructColumn) or + (type == json_col_t::StructColumn and row_type == json_col_t::ListColumn)), + "A mix of lists and structs within the same column is not supported"); // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); From 568cc53953ac3135beb7716d6f9505e44949ed24 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:40:39 -0700 Subject: [PATCH 170/173] re-enables extra logical check --- cpp/src/io/json/nested_json_gpu.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index dc33e15fd7d..42673b4967f 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -976,7 +976,7 @@ void json_column::append_row(uint32_t row_index, "A mix of lists and structs within the same column is not supported"); // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type - // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); + CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); // Fill all the omitted rows with "empty"/null rows (if needed) null_fill(row_index); From 74445abccf288a3570be4f5b5cf1b43dea5711fa Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:42:01 -0700 Subject: [PATCH 171/173] fixes comment typos --- cpp/src/io/json/nested_json_gpu.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 42673b4967f..b93955f4120 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ 
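// ---------------------------------------------------------------------------
// Editorial note, not part of the patch: PATCH 169 turns
//   else if (is_list_struct_mix) { CUDF_FAIL(msg); }
// into an unconditional CUDF_EXPECTS(not is_list_struct_mix, msg). The two
// express the same check, but the condition is negated, and the guard now runs
// on every call rather than only when the earlier branches did not fire; that
// is safe here because those branches leave the column type equal to the row
// type, so the expectation still holds. PATCH 170 then re-enables the
// previously commented-out expectation that the column type is no longer
// Unknown once a row is appended. A minimal stand-in for the macro's
// behaviour, using a plain exception:
#include <stdexcept>

inline void expects(bool condition, char const* message)
{
  // CUDF_EXPECTS-style guard: throw when the stated invariant does not hold.
  if (!condition) { throw std::logic_error(message); }
}
// Usage: expects(not is_list_struct_mix, "A mix of lists and structs ...");
// ---------------------------------------------------------------------------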
-1546,7 +1546,7 @@ std::pair, std::vector> json_column_to rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Range of orchastrating/encapsulating function + // Range of orchestrating/encapsulating function CUDF_FUNC_RANGE(); auto make_validity = @@ -1673,7 +1673,7 @@ table_with_metadata parse_nested_json(host_span input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Range of orchastrating/encapsulating function + // Range of orchestrating/encapsulating function CUDF_FUNC_RANGE(); auto const new_line_delimited_json = options.is_enabled_lines(); From 2bfb987274be22780d41d90c8b9614f1940d314f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:42:34 -0700 Subject: [PATCH 172/173] canonical way for returning empty null mask --- cpp/src/io/json/nested_json_gpu.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b93955f4120..08a31fd45eb 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1602,7 +1602,7 @@ std::pair, std::vector> json_column_to mr); // Reset nullable if we do not have nulls - if (col->null_count() == 0) { col->set_null_mask({}); } + if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } // For string columns return ["offsets", "char"] schema if (target_type.id() == type_id::STRING) { From 028d5ac77ce6e1ee4f2c5a16149c40a9cde49b64 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 19 Sep 2022 14:43:13 -0700 Subject: [PATCH 173/173] removes debug prints from pytest --- python/cudf/cudf/tests/test_json.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 23096f04995..f6ca4691669 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -656,8 +656,6 @@ def test_json_types_data(): ) pdf = pd.read_json(StringIO(json_str), orient="records") pdf.columns = pdf.columns.astype("str") - print(pdf) - print(df) pa_table_pdf = pa.Table.from_pandas( pdf, schema=df.to_arrow().schema, safe=False )
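
A closing note on PATCH 172: when a parsed column turns out to contain no nulls, it is made non-nullable by handing it an explicitly empty null mask together with a null count of zero, rather than the brace-initialized empty argument used before, so the stream and memory resource are explicit and the null count does not have to be recomputed later. A minimal sketch of the pattern, assuming col is a std::unique_ptr<cudf::column> and stream/mr are the caller's stream and memory resource:

#include <cudf/column/column.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <memory>

void drop_unused_null_mask(std::unique_ptr<cudf::column>& col,
                           rmm::cuda_stream_view stream,
                           rmm::mr::device_memory_resource* mr)
{
  // A zero-sized device buffer plus an explicit null count of 0 marks the
  // column as non-nullable without leaving the count to be recomputed.
  if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); }
}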