Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle missing fields as nulls in get_json_object() #10970

Merged
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions cpp/include/cudf/strings/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ class get_json_object_options {
// individual string values are returned with quotes stripped.
bool strip_quotes_from_single_strings = true;

// Whether to return nulls when an object does not contain the requested field.
bool missing_fields_as_nulls = false;

public:
/**
* @brief Default constructor.
Expand Down Expand Up @@ -84,6 +87,30 @@ class get_json_object_options {
return strip_quotes_from_single_strings;
}

/**
* @brief Whether a field not contained by an object is to be interpreted as null.
*
* When set to true, if an object is queried for a field it does not contain, a null is returned.
* The option is false by default.
*
* @code{.pseudo}
*
* With missing_fields_as_nulls OFF:
* Input = {"a" : [{"x": "1", "y": "2"}, {"x": "3"}]}
* Query = $.a[*].y
* Output = ["2"]
*
* With missing_fields_as_nulls ON:
* Input = {"a" : [{"x": "1", "y": "2"}, {"x": "3"}]}
* Query = $.a[*].y
* Output = ["2", null]
*
* @endcode
*
* @return true if missing fields are interpreted as null, false otherwise.
*/
[[nodiscard]] CUDF_HOST_DEVICE inline bool get_missing_fields_as_nulls() const
{
return missing_fields_as_nulls;
}

/**
* @brief Set whether single-quotes for strings are allowed.
*
Expand All @@ -103,6 +130,16 @@ class get_json_object_options {
{
strip_quotes_from_single_strings = _strip_quotes_from_single_strings;
}

/**
* @brief Set whether missing fields are interpreted as null.
*
* @param _missing_fields_as_nulls true to interpret a field that an object does not contain as
* null; false to disable that interpretation.
*/
void set_missing_fields_as_nulls(bool _missing_fields_as_nulls)
{
missing_fields_as_nulls = _missing_fields_as_nulls;
}
};

/**
Expand Down
28 changes: 16 additions & 12 deletions cpp/src/strings/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ namespace detail {

namespace {

// debug accessibility

// change to "\n" and 1 to make output more readable
#define DEBUG_NEWLINE
constexpr int DEBUG_NEWLINE_LEN = 0;
Expand All @@ -61,9 +59,10 @@ constexpr int DEBUG_NEWLINE_LEN = 0;
* or you get nothing back (parse_result::EMPTY)
*/
enum class parse_result {
ERROR, // failure
SUCCESS, // success
EMPTY, // success, but no data
ERROR, // failure
SUCCESS, // success
MISSING_FIELD, // success, but the field is missing
EMPTY, // success, but no data
};

/**
Expand Down Expand Up @@ -325,16 +324,18 @@ class json_state : private parser {
}
// loop until we find a match or there's nothing left
do {
// wildcard matches anything
if (name.size_bytes() == 1 && name.data()[0] == '*') {
return parse_result::SUCCESS;
} else if (cur_el_name == name) {
return parse_result::SUCCESS;
}

// next
parse_result result = next_element_internal(false);
if (result != parse_result::SUCCESS) { return result; }
if (result != parse_result::SUCCESS) {
return options.get_missing_fields_as_nulls() && result == parse_result::EMPTY
? parse_result::MISSING_FIELD
: result;
}
} while (true);

return parse_result::ERROR;
Expand Down Expand Up @@ -727,7 +728,6 @@ __device__ parse_result parse_json_path(json_state& j_state,
int element_count = 0;
while (pop_context(ctx)) {
path_operator op = *ctx.commands;

switch (op.type) {
// whatever the first object is
case path_operator_type::ROOT:
Expand All @@ -745,6 +745,12 @@ __device__ parse_result parse_json_path(json_state& j_state,
PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
if (last_result == parse_result::SUCCESS) {
push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
} else if (last_result == parse_result::MISSING_FIELD) {
if (ctx.list_element && element_count > 0) {
output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
}
output.add_output({"null", 4});
element_count++;
}
}
} break;
Expand Down Expand Up @@ -980,9 +986,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c

constexpr int block_size = 512;
cudf::detail::grid_1d const grid{col.size(), block_size};

auto cdv = column_device_view::create(col.parent(), stream);

// preprocess sizes (returned in the offsets buffer)
get_json_object_kernel<block_size>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
Expand Down Expand Up @@ -1014,6 +1018,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
// compute results
cudf::mutable_column_view chars_view(*chars);
rmm::device_scalar<size_type> d_valid_count{0, stream};

get_json_object_kernel<block_size>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*cdv,
Expand All @@ -1023,7 +1028,6 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
static_cast<bitmask_type*>(validity.data()),
d_valid_count.data(),
options);

return make_strings_column(col.size(),
std::move(offsets),
std::move(chars),
Expand Down
47 changes: 46 additions & 1 deletion cpp/tests/strings/json_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -967,4 +967,49 @@ TEST_F(JsonPathTests, EscapeSequences)
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
}
}

// Checks get_json_object() with the missing_fields_as_nulls option both off and on.
TEST_F(JsonPathTests, MissingFieldsAsNulls)
{
// Fixture: a top-level "tup" array of four objects. Only the first and third objects
// contain "array", and only the fourth contains "a" (with nested fields "x" and "y").
std::string input_string{
// clang-format off
"{"
"\"tup\":"
"["
"{\"id\":\"1\",\"array\":[1,2]},"
"{\"id\":\"2\"},"
"{\"id\":\"3\",\"array\":[3,4]},"
"{\"id\":\"4\", \"a\": {\"x\": \"5\", \"y\": \"6\"}}"
"]"
"}"
// clang-format on
};
// Runs one JSON path against a single-row column built from input_string, twice:
// first with missing_fields_as_nulls off, then with it on.
//   json_path_string      - the path to query
//   default_output        - expected string with the option off
//   missing_fields_output - expected string with the option on
//   default_valid         - whether the expected row is valid (non-null) with the option off;
//                           the expected row with the option on is always marked valid
auto do_test = [&input_string](auto const& json_path_string,
auto const& default_output,
auto const& missing_fields_output,
bool default_valid = true) {
cudf::test::strings_column_wrapper input{input_string};
cudf::strings::get_json_object_options options;

// Test default behavior
options.set_missing_fields_as_nulls(false);
auto const default_result =
cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options);
cudf::test::strings_column_wrapper default_expected({default_output}, {default_valid});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(default_expected, *default_result);

// Test with missing fields as null
options.set_missing_fields_as_nulls(true);
auto const missing_fields_result =
cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options);
cudf::test::strings_column_wrapper missing_fields_expected({missing_fields_output}, {1});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(missing_fields_expected, *missing_fields_result);
};

// tup[1] has no "array": a null row by default, the string "null" with the option on.
do_test("$.tup[1].array", "", "null", false);
// Wildcard over all elements: with the option on, a null stands in for each element
// that lacks "array".
do_test("$.tup[*].array", "[[1,2],[3,4]]", "[[1,2],null,[3,4],null]");
// The top-level field "x" does not exist in the input.
do_test("$.x[*].array", "", "null", false);
// Only the fourth element contains "a", so the first three positions become null.
do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]");
}