Skip to content

Commit

Permalink
feat: Spark get_json_object function support incomplete json
Browse files Browse the repository at this point in the history
  • Loading branch information
leoluan2009 committed Feb 26, 2025
1 parent c560aaf commit 8133be9
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CMake/resolve_dependency_modules/simdjson.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ if(${VELOX_SIMDJSON_SKIPUTF8VALIDATION})
endif()

FetchContent_MakeAvailable(simdjson)
# Build simdjson with its experimental incomplete-JSON support switched on.
# simdjsonParseIncomplete() in velox/functions/prestosql/json/SIMDJsonUtil.cpp
# calls ondemand::parser::iterate_allow_incomplete_json. The definition is
# PUBLIC so that targets linking against simdjson are compiled with the same
# macro as the library itself.
# NOTE(review): this only takes effect on this FetchContent path; presumably a
# simdjson obtained any other way must provide the same definition - confirm.
target_compile_definitions(simdjson
PUBLIC SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON)
6 changes: 6 additions & 0 deletions velox/functions/prestosql/json/SIMDJsonUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,10 @@ simdjson::simdjson_result<simdjson::ondemand::document> simdjsonParse(
return parser.iterate(json);
}

/// Starts an on-demand iteration over 'json' using simdjson's
/// iterate_allow_incomplete_json entry point instead of the regular iterate()
/// call. The parser instance is local to the calling thread.
simdjson::simdjson_result<simdjson::ondemand::document> simdjsonParseIncomplete(
    const simdjson::padded_string_view& json) {
  // Function-local thread_local: every thread gets its own parser object,
  // distinct from any other thread_local parser declared elsewhere.
  thread_local simdjson::ondemand::parser incompleteJsonParser;
  auto document = incompleteJsonParser.iterate_allow_incomplete_json(json);
  return document;
}

} // namespace facebook::velox
5 changes: 5 additions & 0 deletions velox/functions/prestosql/json/SIMDJsonUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,9 @@ void simdjsonErrorsToExceptions(
simdjson::simdjson_result<simdjson::ondemand::document> simdjsonParse(
const simdjson::padded_string_view& json);

/// Parses 'json' with a thread-local simdjson on-demand parser, using
/// parser::iterate_allow_incomplete_json rather than the parser::iterate call
/// made by simdjsonParse, so that incomplete JSON input is allowed.
/// Returns the simdjson result wrapping the on-demand document; callers are
/// expected to check it for an error before using the document.
/// NOTE(review): the returned document presumably refers to both 'json' and
/// the thread-local parser, so it should be consumed before the next parse on
/// the same thread - confirm against the simdjson on-demand documentation.
simdjson::simdjson_result<simdjson::ondemand::document> simdjsonParseIncomplete(
const simdjson::padded_string_view& json);

} // namespace facebook::velox
2 changes: 1 addition & 1 deletion velox/functions/sparksql/GetJsonObject.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ struct GetJsonObjectFunction {
}
simdjson::ondemand::document jsonDoc;
simdjson::padded_string paddedJson(json.data(), json.size());
if (simdjsonParse(paddedJson).get(jsonDoc)) {
if (simdjsonParseIncomplete(paddedJson).get(jsonDoc)) {
return false;
}
const auto formattedJsonPath = jsonPath_.has_value()
Expand Down
29 changes: 29 additions & 0 deletions velox/functions/sparksql/tests/GetJsonObjectTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,34 @@ TEST_F(GetJsonObjectTest, nullResult) {
std::nullopt);
}

// Checks that get_json_object still extracts the requested value when the JSON
// text is malformed around an otherwise readable document: commas trailing the
// closing brace/bracket, runs of commas inside an object, and a dangling key.
TEST_F(GetJsonObjectTest, incompleteJson) {
  struct IncompleteJsonCase {
    const char* json;
    const char* path;
    const char* expected;
  };

  const IncompleteJsonCase kCases[] = {
      {R"({"hello": "3.5"},)", "$.hello", "3.5"},
      {R"({"hello": "3.5",,,,})", "$.hello", "3.5"},
      {R"({"hello": "3.5",,,,"taskSort":"2"})", "$.hello", "3.5"},
      {R"({"hello": "3.5","taskSort":"2",,,,,"taskSort",})",
       "$.hello",
       "3.5"},
      {R"({"hello": "3.5","taskSort":"2",,,,,,})", "$.hello", "3.5"},
      {R"({"hello": "boy","taskSort":"2"},,,,,)", "$.hello", "boy"},
      // The raw JSON carries escape sequences; the expected value holds the
      // corresponding unescaped control characters.
      {R"({"hello": "boy\n"},)", "$.hello", "boy\n"},
      {R"({"hello": "boy\n\t"},)", "$.hello", "boy\n\t"},
      // Nested lookups: array index + field, and bracket-quoted field names
      // returning a whole sub-object.
      {R"([{"my": {"info": {"name": "Alice"}}}, {"other": ["v1", "v2"]}],)",
       "$[1].other[1]",
       "v2"},
      {R"({"my": {"info": {"name": "Alice", "age": "5", "id": "001"}}},)",
       "$['my']['info']",
       R"({"name": "Alice", "age": "5", "id": "001"})"},
  };

  for (const auto& testCase : kCases) {
    // Identify the failing input, since all cases share one assertion line.
    SCOPED_TRACE(testCase.json);
    EXPECT_EQ(getJsonObject(testCase.json, testCase.path), testCase.expected);
  }
}

} // namespace
} // namespace facebook::velox::functions::sparksql::test

0 comments on commit 8133be9

Please sign in to comment.