From 20a9a15b2fef2a46b9091c70184fa5b15e573b40 Mon Sep 17 00:00:00 2001 From: wecharyu Date: Tue, 31 Dec 2024 18:55:56 +0800 Subject: [PATCH 1/5] feat: Add Spark to_json function --- velox/docs/functions/spark/json.rst | 11 + .../registration/RegisterSpecialForm.cpp | 4 + .../sparksql/specialforms/CMakeLists.txt | 5 +- .../sparksql/specialforms/ToJson.cpp | 549 ++++++++++++++++++ .../functions/sparksql/specialforms/ToJson.h | 35 ++ velox/functions/sparksql/tests/CMakeLists.txt | 1 + velox/functions/sparksql/tests/ToJsonTest.cpp | 187 ++++++ 7 files changed, 790 insertions(+), 2 deletions(-) create mode 100644 velox/functions/sparksql/specialforms/ToJson.cpp create mode 100644 velox/functions/sparksql/specialforms/ToJson.h create mode 100644 velox/functions/sparksql/tests/ToJsonTest.cpp diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst index 5f853f0698e1..fe5f57f343e3 100644 --- a/velox/docs/functions/spark/json.rst +++ b/velox/docs/functions/spark/json.rst @@ -44,3 +44,14 @@ JSON Functions SELECT json_object_keys(''); -- NULL SELECT json_object_keys(1); -- NULL SELECT json_object_keys('"hello"'); -- NULL + +.. spark:function:: to_json(json_object) -> jsonString + + Converts a Json object (ROW, ARRAY or MAP) into a JSON string. + The current implementation has following limitations. + + * Does not support user provided options. :: + + SELECT to_json(ROW(1, "a")); -- {"a":1} + SELECT to_json(ARRAY[1, 2, 3]); -- [1,2,3] + SELECT to_json(MAP(ARRAY['x', 'y'], ARRAY[1, 2])); -- {"x":1,"y":2} diff --git a/velox/functions/sparksql/registration/RegisterSpecialForm.cpp b/velox/functions/sparksql/registration/RegisterSpecialForm.cpp index d9f12abe4f80..bccd303c23af 100644 --- a/velox/functions/sparksql/registration/RegisterSpecialForm.cpp +++ b/velox/functions/sparksql/registration/RegisterSpecialForm.cpp @@ -20,6 +20,7 @@ #include "velox/functions/sparksql/specialforms/DecimalRound.h" #include "velox/functions/sparksql/specialforms/MakeDecimal.h" #include "velox/functions/sparksql/specialforms/SparkCastExpr.h" +#include "velox/functions/sparksql/specialforms/ToJson.h" namespace facebook::velox::functions { void registerSparkSpecialFormFunctions() { @@ -44,6 +45,9 @@ void registerSpecialFormGeneralFunctions(const std::string& prefix) { "cast", std::make_unique()); registerFunctionCallToSpecialForm( "try_cast", std::make_unique()); + exec::registerFunctionCallToSpecialForm( + ToJsonCallToSpecialForm::kToJson, + std::make_unique()); } } // namespace sparksql } // namespace facebook::velox::functions diff --git a/velox/functions/sparksql/specialforms/CMakeLists.txt b/velox/functions/sparksql/specialforms/CMakeLists.txt index e141e0074bc8..e0c5e741f3a1 100644 --- a/velox/functions/sparksql/specialforms/CMakeLists.txt +++ b/velox/functions/sparksql/specialforms/CMakeLists.txt @@ -18,7 +18,8 @@ velox_add_library( DecimalRound.cpp MakeDecimal.cpp SparkCastExpr.cpp - SparkCastHooks.cpp) + SparkCastHooks.cpp + ToJson.cpp) velox_link_libraries(velox_functions_spark_specialforms fmt::fmt - velox_expression) + velox_functions_json velox_expression) diff --git a/velox/functions/sparksql/specialforms/ToJson.cpp b/velox/functions/sparksql/specialforms/ToJson.cpp new file mode 100644 index 000000000000..4e8edc2443b2 --- /dev/null +++ b/velox/functions/sparksql/specialforms/ToJson.cpp @@ -0,0 +1,549 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/functions/sparksql/specialforms/ToJson.h" +#include "velox/functions/prestosql/types/JsonType.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace facebook::velox::functions::sparksql { +namespace { + +template +void generateJsonTyped( + const SimpleVector& input, + int row, + std::string& result, + const TypePtr& type) { + auto value = input.valueAt(row); + + if constexpr (std::is_same_v) { + size_t resultSize = escapedStringSize(value.data(), value.size()); + result.resize(resultSize + 2); + result.data()[0] = '"'; + escapeString(value.data(), value.size(), result.data() + 1); + result.data()[resultSize + 1] = '"'; + } else if constexpr (std::is_same_v) { + VELOX_FAIL( + "Convert UNKNOWN to JSON: Vextors of UNKNOWN type should not contain non-null rows"); + } else { + if constexpr (std::is_same_v) { + result.append(value ? "true" : "false"); + } else if constexpr ( + std::is_same_v || std::is_same_v) { + if (FOLLY_UNLIKELY(std::isinf(value) || std::isnan(value))) { + result.append(fmt::format( + "\"{}\"", + util::Converter::tryCast(value).value())); + } else { + result.append( + util::Converter::tryCast(value).value()); + } + } else if constexpr (std::is_same_v) { + std::string stringValue = std::to_string(value); + result.reserve(stringValue.size() + 2); + result.append("\""); + result.append(stringValue); + result.append("\""); + } else if (type->isDate()) { + std::string stringValue = DATE()->toString(value); + result.reserve(stringValue.size() + 2); + result.append("\""); + result.append(stringValue); + result.append("\""); + } else if (type->isDecimal()) { + result.append(DecimalUtil::toString(value, type)); + } else { + folly::toAppend(value, &result); + } + } +} + +// Convert primitive-type input vectors to Json string. +template < + TypeKind kind, + typename std::enable_if_t::isPrimitiveType, int> = 0> +void toJson( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult) { + using T = typename TypeTraits::NativeType; + + // input is guaranteed to be in flat or constant encodings when passed in. + auto inputVector = input.as>(); + + std::string result; + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputVector->isNullAt(row)) { + flatResult.set(row, "null"); + } else { + result.clear(); + generateJsonTyped(*inputVector, row, result, inputVector->type()); + + flatResult.set(row, StringView(result)); + } + }); +} + +// Forward declaration. +void toJsonFromRow( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult); + +void toJsonFromArray( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult); + +void toJsonFromMap( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult); + +// Convert complex-type input vectors to Json string. +template < + TypeKind kind, + typename std::enable_if_t::isPrimitiveType, int> = 0> +void toJson( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult) { + if constexpr (kind == TypeKind::ROW) { + toJsonFromRow(input, context, rows, flatResult); + } else if constexpr (kind == TypeKind::ARRAY) { + toJsonFromArray(input, context, rows, flatResult); + } else if constexpr (kind == TypeKind::MAP) { + toJsonFromMap(input, context, rows, flatResult); + } else { + VELOX_FAIL("{} is not supported in to_json.", input.type()->toString()); + } +} + +// Helper struct representing the Json vector of input. +struct AsJson { + AsJson( + exec::EvalCtx& context, + const VectorPtr& input, + const SelectivityVector& rows, + const BufferPtr& elementToTopLevelRows) + : decoded_(context) { + VELOX_CHECK(rows.hasSelections()); + + exec::EvalErrorsPtr oldErrors; + context.swapErrors(oldErrors); + if (isJsonType(input->type())) { + json_ = input; + } else { + if (!exec::PeeledEncoding::isPeelable(input->encoding())) { + serialize(context, input, rows, json_); + } else { + exec::withContextSaver([&](exec::ContextSaver& saver){ + exec::LocalSelectivityVector newRowsHodler(*context.execCtx()); + + exec::LocalDecodedVector localDecoded(context); + std::vector peeledVectors; + auto peeledEncoding = exec::PeeledEncoding::peel( + {input}, rows, localDecoded, true, peeledVectors); + VELOX_CHECK_EQ(peeledVectors.size(), 1); + auto newRows = + peeledEncoding->translateToInnerRows(rows, newRowsHodler); + // Save context and set the peel + context.saveAndReset(saver, rows); + context.setPeeledEncoding(peeledEncoding); + + serialize(context, peeledVectors[0], *newRows, json_); + json_ = context.getPeeledEncoding()->wrap( + json_->type(), context.pool(), json_, rows); + }); + } + } + decoded_.get()->decode(*json_, rows); + jsonStrings_ = decoded_->base()->as>(); + + combineErrors(context, rows, elementToTopLevelRows, oldErrors); + } + + StringView at(vector_size_t i) const { + return jsonStrings_->valueAt(decoded_->index(i)); + } + + // Returns the length of the json string of the value at i, when this + // value will be inlined as an element in the json string of an array, map, or + // row. + vector_size_t lengthAt(vector_size_t i) const { + if (decoded_->isNullAt(i)) { + // Null values are inlined as "null". + return 4; + } else { + return this->at(i).size(); + } + } + + // Appends the json string of the value at i to a string writer. + void append(vector_size_t i, exec::StringWriter<>& proxy) const { + if (decoded_->isNullAt(i)) { + proxy.append("null"); + } else { + proxy.append(this->at(i)); + } + } + + private: + void serialize( + exec::EvalCtx& context, + const VectorPtr& input, + const SelectivityVector& baseRows, + VectorPtr& result) { + context.ensureWritable(baseRows, JSON(), result); + auto flatJsonStrings = result->as>(); + + VELOX_DYNAMIC_TYPE_DISPATCH_ALL( + toJson, + input->typeKind(), + *input, + context, + baseRows, + *flatJsonStrings); + } + + // Combine exceptions in oldErrors into context.errors_ with a transformation + // of rows mapping provided by elementToTopLevelRows. If there are exceptions + // at the same row in both context.errors_ and oldErrors, the one in oldErrors + // remains. elementToTopLevelRows can be a nullptr, meaning that the rows in + // context.errors_ correspond to rows in oldErrors exactly. + void combineErrors( + exec::EvalCtx& context, + const SelectivityVector& rows, + const BufferPtr& elementToTopLevelRows, + exec::EvalErrorsPtr& oldErrors) { + if (context.errors()) { + if (elementToTopLevelRows) { + context.addElementErrorsToTopLevel( + rows, elementToTopLevelRows, oldErrors); + } else { + context.addErrors(rows, *context.errorsPtr(), oldErrors); + } + } + context.swapErrors(oldErrors); + } + + exec::LocalDecodedVector decoded_; + VectorPtr json_; + const SimpleVector* jsonStrings_; +}; + +void toJsonFromRow( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult) { + // input is guaranteed to be in flat encoding when passed in. + VELOX_CHECK_EQ(input.encoding(), VectorEncoding::Simple::ROW); + auto inputRow = input.as(); + auto childrenSize = inputRow->childrenSize(); + + auto& rowType = inputRow->type()->asRow(); + VELOX_CHECK_EQ(rowType.size(), childrenSize, "Mismatch in row type size"); + + // Estimates an upperbound of the total length of all Json strings for the + // input according to the length of all children Json strings and the + // delimiters to be added. + size_t childrenStringSize = 0; + std::vector childrenAsJson; + for (int i = 0; i < childrenSize; ++i) { + childrenAsJson.emplace_back(context, inputRow->childAt(i), rows, nullptr); + + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputRow->isNullAt(row)) { + // "null" will be inlined in the StringView. + return; + } + childrenStringSize += childrenAsJson[i].lengthAt(row); + }); + } + + // Extra length for commas and brackets + childrenStringSize += + rows.countSelected() * (childrenSize > 0 ? childrenSize + 1 : 2); + flatResult.getBufferWithSpace(childrenStringSize); + + // Constructs Json string of each row from Json strings of its children. + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputRow->isNullAt(row)) { + flatResult.set(row, "null"); + return; + } + + auto proxy = exec::StringWriter<>(&flatResult, row); + + proxy.append("{"_sv); + for (int i = 0; i < childrenSize; ++i) { + if (i > 0) { + proxy.append(","_sv); + } + + proxy.append("\""_sv); + proxy.append(rowType.nameOf(i)); + proxy.append("\":"_sv); + + childrenAsJson[i].append(row, proxy); + } + proxy.append("}"_sv); + + proxy.finalize(); + }); +} + +void toJsonFromArray( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult) { + // input is guranteed to be in flat encoding when passed in. + auto inputArray = input.as(); + + auto elements = inputArray->elements(); + auto elementsRows = + functions::toElementRows(elements->size(), rows, inputArray); + if (!elementsRows.hasSelections()) { + // All arrays are null or empty. + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputArray->isNullAt(row)) { + flatResult.set(row, "null"); + } else { + VELOX_CHECK_EQ( + inputArray->sizeAt(row), + 0, + "All arrays are expected to be null or empty"); + flatResult.set(row, "[]"); + } + }); + return; + } + + auto elementToTopLevelRows = functions::getElementToTopLevelRows( + elements->size(), rows, inputArray, context.pool()); + AsJson elementsAsJson{ + context, elements, elementsRows, elementToTopLevelRows}; + + // Estimates an upperbound of the total length of all Json strings for the + // input according to the length of all elements Json strings and the + // delimiters to be added. + size_t elementsStringSize = 0; + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputArray->isNullAt(row)) { + // "null" will be inlined in the StringView. + return; + } + + auto offset = inputArray->offsetAt(row); + auto size = inputArray->sizeAt(row); + for (auto i = offset, end = offset + size; i < end; ++i) { + elementsStringSize += elementsAsJson.lengthAt(i); + } + + // Extra length for commas and brackets. + elementsStringSize += size > 0 ? size + 1 : 2; + }); + + flatResult.getBufferWithSpace(elementsStringSize); + + // Constructs the Json string of each array from Json strings of its elements. + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputArray->isNullAt(row)) { + flatResult.set(row, "null"); + return; + } + + auto offset = inputArray->offsetAt(row); + auto size = inputArray->sizeAt(row); + + auto proxy = exec::StringWriter<>(&flatResult, row); + + proxy.append("["_sv); + for (int i = offset, end = offset + size; i < end; ++i) { + if (i > offset) { + proxy.append(","_sv); + } + elementsAsJson.append(i, proxy); + } + proxy.append("]"_sv); + + proxy.finalize(); + }); +} + +void toJsonFromMap( + const BaseVector& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + FlatVector& flatResult) { + // input is guaranteed to be in flat encoding when passed in. + auto inputMap = input.as(); + auto& mapType = inputMap->type()->asMap(); + + auto mapKeys = inputMap->mapKeys(); + auto mapValues = inputMap->mapValues(); + auto elementsRows = functions::toElementRows(mapKeys->size(), rows, inputMap); + if (!elementsRows.hasSelections()) { + // All maps are null or empty. + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputMap->isNullAt(row)) { + flatResult.set(row, "null"); + } else { + VELOX_CHECK_EQ( + inputMap->sizeAt(row), + 0, + "All maps are expected to be null or empty"); + flatResult.set(row, "{}"); + } + }); + return; + } + + auto elementToTopLevelRows = functions::getElementToTopLevelRows( + mapKeys->size(), rows, inputMap, context.pool()); + + AsJson keysAsJson{ + context, mapKeys, elementsRows, elementToTopLevelRows}; + AsJson valuesAsJson{ + context, mapValues, elementsRows, elementToTopLevelRows}; + + // Estimates an upperbound of the total length of all Json strings for the + // input according to the length of all elements Json strings and the + // delimiters to be added. + size_t elementsStringSize = 0; + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputMap->isNullAt(row)) { + // "null" will be inlined in the StringView. + return; + } + + auto offset = inputMap->offsetAt(row); + auto size = inputMap->sizeAt(row); + for (auto i = offset, end = offset + size; i < end; ++i) { + // The construction of keysAsJson ensured there is no null in keysAsJson + elementsStringSize += keysAsJson.at(i).size() + valuesAsJson.lengthAt(i); + } + + // Extra length for commas, semicolons, and curly braces. + elementsStringSize += size > 0 ? size * 2 + 1 : 2; + }); + + flatResult.getBufferWithSpace(elementsStringSize); + + // Constructs the Json string of each map from Json strings of its keys and + // values. + std::vector> sortedKeys; + context.applyToSelectedNoThrow(rows, [&](auto row) { + if (inputMap->isNullAt(row)) { + flatResult.set(row, "null"); + return; + } + + auto offset = inputMap->offsetAt(row); + auto size = inputMap->sizeAt(row); + + // Sort entries by keys in each map. + sortedKeys.clear(); + for (int i = offset, end = offset + size; i < end; ++i) { + sortedKeys.push_back(std::make_pair(keysAsJson.at(i), i)); + } + std::sort(sortedKeys.begin(), sortedKeys.end()); + + auto proxy = exec::StringWriter<>(&flatResult, row); + + proxy.append("{"_sv); + for (auto it = sortedKeys.begin(); it != sortedKeys.end(); ++it) { + if (it != sortedKeys.begin()) { + proxy.append(","_sv); + } + std::string keyFormat = mapType.childAt(0)->isVarchar() ? "{}:" : "\"{}\":"; + proxy.append(fmt::format(keyFormat, it->first)); + valuesAsJson.append(it->second, proxy); + } + proxy.append("}"_sv); + + proxy.finalize(); + }); +} + +class ToJsonFunction final : public exec::VectorFunction { + public: + void apply( + const SelectivityVector& rows, + std::vector& args, // Not using const ref so we can reuse args + const TypePtr& outputType, + exec::EvalCtx& context, + VectorPtr& result) const final { + context.ensureWritable(rows, outputType, result); + result->clearNulls(rows); + auto* rawResults = result->as>(); + + VELOX_DYNAMIC_TYPE_DISPATCH_ALL( + toJson, args[0]->typeKind(), *args[0], context, rows, *rawResults); + } +}; + +} // namespace + +TypePtr ToJsonCallToSpecialForm::resolveType(const std::vector& /* argTypes */) { + VELOX_FAIL("to_json function does not support type resolution."); +} + +exec::ExprPtr ToJsonCallToSpecialForm::constructSpecialForm( + const TypePtr &type, + std::vector&& args, + bool trackCpuUsage, + const core::QueryConfig& /* config */) { + VELOX_USER_CHECK(type->isVarchar(), "The result type of to_json should be VARCHAR"); + VELOX_USER_CHECK_EQ(args.size(), 1, "to_json expects one argument."); + VELOX_USER_CHECK( + args[0]->type()->isRow() || args[0]->type()->isArray() || args[0]->type()->isMap(), + "The argument type of to_json should be row, array or map."); + + return std::make_shared( + type, + std::move(args), + std::make_shared(), + exec::VectorFunctionMetadata{}, + kToJson, + trackCpuUsage); +} +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/specialforms/ToJson.h b/velox/functions/sparksql/specialforms/ToJson.h new file mode 100644 index 000000000000..b4a946c05b59 --- /dev/null +++ b/velox/functions/sparksql/specialforms/ToJson.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/expression/FunctionCallToSpecialForm.h" + +namespace facebook::velox::functions::sparksql { + +class ToJsonCallToSpecialForm : public exec::FunctionCallToSpecialForm { + public: + + TypePtr resolveType(const std::vector& argTypes) override; + + exec::ExprPtr constructSpecialForm( + const TypePtr& type, + std::vector&& args, + bool trackCpuUsage, + const core::QueryConfig& config) override; + + static constexpr const char* kToJson = "to_json"; +}; +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt index 39087bd8adb5..f8dedc8cdcd4 100644 --- a/velox/functions/sparksql/tests/CMakeLists.txt +++ b/velox/functions/sparksql/tests/CMakeLists.txt @@ -55,6 +55,7 @@ add_executable( SplitTest.cpp StringTest.cpp StringToMapTest.cpp + ToJsonTest.cpp UnscaledValueFunctionTest.cpp UuidTest.cpp XxHash64Test.cpp) diff --git a/velox/functions/sparksql/tests/ToJsonTest.cpp b/velox/functions/sparksql/tests/ToJsonTest.cpp new file mode 100644 index 000000000000..036d232b4127 --- /dev/null +++ b/velox/functions/sparksql/tests/ToJsonTest.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "velox/common/base/tests/GTestUtils.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace facebook::velox::test; + +namespace facebook::velox::functions::sparksql::test { +namespace { +constexpr float kNaNFloat = std::numeric_limits::quiet_NaN(); +constexpr float kInfFloat = std::numeric_limits::infinity(); +constexpr double kNaNDouble = std::numeric_limits::quiet_NaN(); +constexpr double kInfDouble = std::numeric_limits::infinity(); + +class ToJsonTest : public SparkFunctionBaseTest { + protected: + core::CallTypedExprPtr createToJson(const TypePtr& inputType) { + std::vector inputs = { + std::make_shared(inputType, "c0")}; + return std::make_shared( + VARCHAR(), std::move(inputs), "to_json"); + } + + void testToJson(const VectorPtr& input, const VectorPtr& expected) { + auto expr = createToJson(input->type()); + testEncodings(expr, {input}, expected); + } +}; + +TEST_F(ToJsonTest, basicStruct) { + auto input = makeRowVector({"a"}, {makeFlatVector({1, 2, 3})}); + auto expected = makeFlatVector( + {R"({"a":1})", R"({"a":2})", R"({"a":3})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicArray) { + auto input = makeArrayVector({{1}, {2, 3}, {}}); + auto expected = makeFlatVector({R"([1])", R"([2,3])", R"([])"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicMap) { + auto input = makeMapVector( + {{{"a", 1}}, {{"b", 2}}, {{"c", 3}}}); + auto expected = makeFlatVector( + {R"({"a":1})", R"({"b":2})", R"({"c":3})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicBool) { + auto data = makeNullableFlatVector( + {true, false, std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":true})", R"({"a":false})", R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicTinyInt) { + auto data = makeNullableFlatVector({0, 127, 128, -128, -129, std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":0})", + R"({"a":127})", + R"({"a":-128})", + R"({"a":-128})", + R"({"a":127})", + R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicSmallInt) { + auto data = makeNullableFlatVector({0, 32768, -32769, std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":0})", + R"({"a":-32768})", + R"({"a":32767})", + R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicInt) { + auto data = makeNullableFlatVector({0, 2147483648, -2147483649, std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":0})", + R"({"a":-2147483648})", + R"({"a":2147483647})", + R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicFloat) { + auto data = makeNullableFlatVector( + {1.0, kNaNFloat, kInfFloat, -kInfFloat, std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":1.0})", + R"({"a":"NaN"})", + R"({"a":"Infinity"})", + R"({"a":"-Infinity"})", + R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicDouble) { + auto data = makeNullableFlatVector( + {1.0, kNaNDouble, kInfDouble, -kInfDouble, std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":1.0})", + R"({"a":"NaN"})", + R"({"a":"Infinity"})", + R"({"a":"-Infinity"})", + R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicTimestamp) { + auto data = makeNullableFlatVector( + {Timestamp(0, 0), + Timestamp(1582934400, 0), + Timestamp(-2208988800, 0), + Timestamp(253402300799, 0), + std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":"1970-01-01T00:00:00.000000000"})", + R"({"a":"2020-02-29T00:00:00.000000000"})", + R"({"a":"1900-01-01T00:00:00.000000000"})", + R"({"a":"9999-12-31T23:59:59.000000000"})", + R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, basicDate) { + auto data = makeNullableFlatVector( + {0, 18321, -25567, 2932896, std::nullopt}, DateType::get()); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":"1970-01-01"})", + R"({"a":"2020-02-29"})", + R"({"a":"1900-01-01"})", + R"({"a":"9999-12-31"})", + R"({"a":null})"}); + testToJson(input, expected); +} + +TEST_F(ToJsonTest, nestedComplexType) { + auto data1 = makeNullableFlatVector({"str1", "str2", "str3"}); + auto data2 = makeNullableArrayVector({ + {1, 2, 3}, + {}, + {std::nullopt}}); + auto data3 = makeMapVector( + {{{"key1", 1}}, {{"key2", 2}}, {{"key3", 3}}}); + auto input = makeRowVector({"a", "b", "c"}, {data1, data2, data3}); + auto expected = makeFlatVector( + {R"({"a":"str1","b":[1,2,3],"c":{"key1":1}})", + R"({"a":"str2","b":[],"c":{"key2":2}})", + R"({"a":"str3","b":[null],"c":{"key3":3}})"}); + testToJson(input, expected); +} +} // namespace +} // namespace facebook::velox::functions::sparksql::test From e0e7f28b8595939d93383a9bd49f843618abd73a Mon Sep 17 00:00:00 2001 From: wecharyu Date: Wed, 22 Jan 2025 19:22:38 +0800 Subject: [PATCH 2/5] remove special form of ToJsonFunction --- velox/docs/functions/spark/json.rst | 2 +- velox/functions/sparksql/CMakeLists.txt | 1 + .../sparksql/{specialforms => }/ToJson.cpp | 37 +++++++------------ .../sparksql/registration/RegisterJson.cpp | 2 + .../registration/RegisterSpecialForm.cpp | 4 -- .../sparksql/specialforms/CMakeLists.txt | 5 +-- .../functions/sparksql/specialforms/ToJson.h | 35 ------------------ 7 files changed, 20 insertions(+), 66 deletions(-) rename velox/functions/sparksql/{specialforms => }/ToJson.cpp (94%) delete mode 100644 velox/functions/sparksql/specialforms/ToJson.h diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst index fe5f57f343e3..85697abe25b2 100644 --- a/velox/docs/functions/spark/json.rst +++ b/velox/docs/functions/spark/json.rst @@ -45,7 +45,7 @@ JSON Functions SELECT json_object_keys(1); -- NULL SELECT json_object_keys('"hello"'); -- NULL -.. spark:function:: to_json(json_object) -> jsonString +.. spark:function:: to_json(jsonObject) -> jsonString Converts a Json object (ROW, ARRAY or MAP) into a JSON string. The current implementation has following limitations. diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt index 2e60515c925c..508050460f96 100644 --- a/velox/functions/sparksql/CMakeLists.txt +++ b/velox/functions/sparksql/CMakeLists.txt @@ -29,6 +29,7 @@ velox_add_library( RegexFunctions.cpp Size.cpp String.cpp + ToJson.cpp UnscaledValueFunction.cpp) velox_link_libraries( diff --git a/velox/functions/sparksql/specialforms/ToJson.cpp b/velox/functions/sparksql/ToJson.cpp similarity index 94% rename from velox/functions/sparksql/specialforms/ToJson.cpp rename to velox/functions/sparksql/ToJson.cpp index 4e8edc2443b2..a4e9df94e9e6 100644 --- a/velox/functions/sparksql/specialforms/ToJson.cpp +++ b/velox/functions/sparksql/ToJson.cpp @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "velox/functions/sparksql/specialforms/ToJson.h" #include "velox/functions/prestosql/types/JsonType.h" +#include #include #include #include @@ -519,31 +519,22 @@ class ToJsonFunction final : public exec::VectorFunction { VELOX_DYNAMIC_TYPE_DISPATCH_ALL( toJson, args[0]->typeKind(), *args[0], context, rows, *rawResults); } + + static std::vector> signatures() { + // T(ROW/ARRAY/MAP) -> varchar + return {exec::FunctionSignatureBuilder() + .typeVariable("T") + .returnType("varchar") + .argumentType("T") + .build()}; + } }; } // namespace -TypePtr ToJsonCallToSpecialForm::resolveType(const std::vector& /* argTypes */) { - VELOX_FAIL("to_json function does not support type resolution."); -} +VELOX_DECLARE_VECTOR_FUNCTION( + udf_to_json, + ToJsonFunction::signatures(), + std::make_unique()); -exec::ExprPtr ToJsonCallToSpecialForm::constructSpecialForm( - const TypePtr &type, - std::vector&& args, - bool trackCpuUsage, - const core::QueryConfig& /* config */) { - VELOX_USER_CHECK(type->isVarchar(), "The result type of to_json should be VARCHAR"); - VELOX_USER_CHECK_EQ(args.size(), 1, "to_json expects one argument."); - VELOX_USER_CHECK( - args[0]->type()->isRow() || args[0]->type()->isArray() || args[0]->type()->isMap(), - "The argument type of to_json should be row, array or map."); - - return std::make_shared( - type, - std::move(args), - std::make_shared(), - exec::VectorFunctionMetadata{}, - kToJson, - trackCpuUsage); -} } // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/registration/RegisterJson.cpp b/velox/functions/sparksql/registration/RegisterJson.cpp index 7f41807ceb94..69c752d5a6ed 100644 --- a/velox/functions/sparksql/registration/RegisterJson.cpp +++ b/velox/functions/sparksql/registration/RegisterJson.cpp @@ -16,6 +16,7 @@ #include "velox/functions/lib/RegistrationHelpers.h" #include "velox/functions/sparksql/GetJsonObject.h" #include "velox/functions/sparksql/JsonObjectKeys.h" +#include namespace facebook::velox::functions::sparksql { @@ -24,6 +25,7 @@ void registerJsonFunctions(const std::string& prefix) { {prefix + "get_json_object"}); registerFunction, Varchar>( {prefix + "json_object_keys"}); + VELOX_REGISTER_VECTOR_FUNCTION(udf_to_json, prefix + "to_json"); } } // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/registration/RegisterSpecialForm.cpp b/velox/functions/sparksql/registration/RegisterSpecialForm.cpp index bccd303c23af..d9f12abe4f80 100644 --- a/velox/functions/sparksql/registration/RegisterSpecialForm.cpp +++ b/velox/functions/sparksql/registration/RegisterSpecialForm.cpp @@ -20,7 +20,6 @@ #include "velox/functions/sparksql/specialforms/DecimalRound.h" #include "velox/functions/sparksql/specialforms/MakeDecimal.h" #include "velox/functions/sparksql/specialforms/SparkCastExpr.h" -#include "velox/functions/sparksql/specialforms/ToJson.h" namespace facebook::velox::functions { void registerSparkSpecialFormFunctions() { @@ -45,9 +44,6 @@ void registerSpecialFormGeneralFunctions(const std::string& prefix) { "cast", std::make_unique()); registerFunctionCallToSpecialForm( "try_cast", std::make_unique()); - exec::registerFunctionCallToSpecialForm( - ToJsonCallToSpecialForm::kToJson, - std::make_unique()); } } // namespace sparksql } // namespace facebook::velox::functions diff --git a/velox/functions/sparksql/specialforms/CMakeLists.txt b/velox/functions/sparksql/specialforms/CMakeLists.txt index e0c5e741f3a1..e141e0074bc8 100644 --- a/velox/functions/sparksql/specialforms/CMakeLists.txt +++ b/velox/functions/sparksql/specialforms/CMakeLists.txt @@ -18,8 +18,7 @@ velox_add_library( DecimalRound.cpp MakeDecimal.cpp SparkCastExpr.cpp - SparkCastHooks.cpp - ToJson.cpp) + SparkCastHooks.cpp) velox_link_libraries(velox_functions_spark_specialforms fmt::fmt - velox_functions_json velox_expression) + velox_expression) diff --git a/velox/functions/sparksql/specialforms/ToJson.h b/velox/functions/sparksql/specialforms/ToJson.h deleted file mode 100644 index b4a946c05b59..000000000000 --- a/velox/functions/sparksql/specialforms/ToJson.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "velox/expression/FunctionCallToSpecialForm.h" - -namespace facebook::velox::functions::sparksql { - -class ToJsonCallToSpecialForm : public exec::FunctionCallToSpecialForm { - public: - - TypePtr resolveType(const std::vector& argTypes) override; - - exec::ExprPtr constructSpecialForm( - const TypePtr& type, - std::vector&& args, - bool trackCpuUsage, - const core::QueryConfig& config) override; - - static constexpr const char* kToJson = "to_json"; -}; -} // namespace facebook::velox::functions::sparksql From fc6eedd373c3279d7c576a217c038197c3fc7886 Mon Sep 17 00:00:00 2001 From: wecharyu Date: Thu, 27 Feb 2025 17:59:30 +0000 Subject: [PATCH 3/5] use buffer for string --- velox/docs/functions/spark/json.rst | 4 + velox/functions/sparksql/ToJson.cpp | 272 +++++++++++++----- velox/functions/sparksql/tests/ToJsonTest.cpp | 49 +++- 3 files changed, 244 insertions(+), 81 deletions(-) diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst index 85697abe25b2..4f779e6691ce 100644 --- a/velox/docs/functions/spark/json.rst +++ b/velox/docs/functions/spark/json.rst @@ -52,6 +52,10 @@ JSON Functions * Does not support user provided options. :: + to_json(ROW(1, "a"), map('option', 'value')) + + Examples of valid inputs are listed as below. :: + SELECT to_json(ROW(1, "a")); -- {"a":1} SELECT to_json(ARRAY[1, 2, 3]); -- [1,2,3] SELECT to_json(MAP(ARRAY['x', 'y'], ARRAY[1, 2])); -- {"x":1,"y":2} diff --git a/velox/functions/sparksql/ToJson.cpp b/velox/functions/sparksql/ToJson.cpp index a4e9df94e9e6..fa9ac15a0876 100644 --- a/velox/functions/sparksql/ToJson.cpp +++ b/velox/functions/sparksql/ToJson.cpp @@ -16,76 +16,197 @@ #include "velox/functions/prestosql/types/JsonType.h" -#include #include -#include -#include -#include +#include #include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace facebook::velox::functions::sparksql { namespace { -template -void generateJsonTyped( - const SimpleVector& input, - int row, - std::string& result, +template +std::enable_if_t, size_t> +append(T value, char* const buffer) { + const auto oute = buffer + folly::to_ascii_size_max_decimal + 1; + auto uvalue = value < 0 ? ~static_cast(value) + 1 : static_cast(value); + size_t p = 0; + char* writtenPosition = buffer; + if (value < 0) { + *writtenPosition++ = '-'; + p += 1; + }; + p += folly::to_ascii_decimal(writtenPosition, oute, uvalue); + return p; +} + +template +std::enable_if_t, size_t> +append(T value, char* const buffer) { + std::string result; + if (FOLLY_UNLIKELY(std::isinf(value) || std::isnan(value))) { + result = fmt::format( + "\"{}\"", + util::Converter::tryCast(value).value()); + } else { + result = util::Converter::tryCast(value).value(); + } + std::memcpy(buffer, result.c_str(), result.size()); + return result.size(); +} + +template +size_t convertToString( + T value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + VELOX_FAIL("{} is not supported in to_json.", type->toString()); +} + +template<> +size_t convertToString( + bool value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + static const char TRUE[] = "true"; + static const char FALSE[] = "false"; + char* pos = buffer; + const char* res = value ? TRUE : FALSE; + const size_t size = value ? 4 : 5; + std::memcpy(pos, res, size); + return size; +} + +template<> +size_t convertToString( + int8_t value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + return append(value, buffer); +} + +template<> +size_t convertToString( + int16_t value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + return append(value, buffer); +} + +template<> +size_t convertToString( + int32_t value, + char* const buffer, + exec::EvalCtx& context, const TypePtr& type) { - auto value = input.valueAt(row); - - if constexpr (std::is_same_v) { - size_t resultSize = escapedStringSize(value.data(), value.size()); - result.resize(resultSize + 2); - result.data()[0] = '"'; - escapeString(value.data(), value.size(), result.data() + 1); - result.data()[resultSize + 1] = '"'; - } else if constexpr (std::is_same_v) { - VELOX_FAIL( - "Convert UNKNOWN to JSON: Vextors of UNKNOWN type should not contain non-null rows"); + if (type->isDate()) { + std::string stringValue = DATE()->toString(value); + return snprintf(buffer, stringValue.size() + 3, "\"%s\"", stringValue.c_str()); } else { - if constexpr (std::is_same_v) { - result.append(value ? "true" : "false"); - } else if constexpr ( - std::is_same_v || std::is_same_v) { - if (FOLLY_UNLIKELY(std::isinf(value) || std::isnan(value))) { - result.append(fmt::format( - "\"{}\"", - util::Converter::tryCast(value).value())); - } else { - result.append( - util::Converter::tryCast(value).value()); - } - } else if constexpr (std::is_same_v) { - std::string stringValue = std::to_string(value); - result.reserve(stringValue.size() + 2); - result.append("\""); - result.append(stringValue); - result.append("\""); - } else if (type->isDate()) { - std::string stringValue = DATE()->toString(value); - result.reserve(stringValue.size() + 2); - result.append("\""); - result.append(stringValue); - result.append("\""); - } else if (type->isDecimal()) { - result.append(DecimalUtil::toString(value, type)); - } else { - folly::toAppend(value, &result); - } + return append(value, buffer); + } +} + +template<> +size_t convertToString( + int64_t value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + if (type->isDecimal()) { + auto [precision, scale] = getDecimalPrecisionScale(*type); + auto size = DecimalUtil::maxStringViewSize(precision, scale); + return DecimalUtil::castToString(value, scale, size, buffer); + } else { + return append(value, buffer); + } +} + +template<> +size_t convertToString( + int128_t value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + const auto oute = buffer + folly::detail::digitsEnough() + 1; + size_t p; + if (value < 0) { + *buffer = '-'; + p = 1 + folly::detail::unsafeTelescope128(buffer + 1, oute, -value); + } else { + p = folly::detail::unsafeTelescope128(buffer, oute, value); + } + return p; +} + +template<> +size_t convertToString( + float value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + return append(value, buffer); +} + +template<> +size_t convertToString( + double value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + return append(value, buffer); +} + +template<> +size_t convertToString( + StringView value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + size_t size = escapedStringSize(value.data(), value.size()); + *buffer = '"'; + escapeString(value.data(), size, buffer + 1); + *(buffer + size + 1) = '"'; + return size + 2; +} + +template<> +size_t convertToString( + Timestamp value, + char* const buffer, + exec::EvalCtx& context, + const TypePtr& type) { + // Spark converts Timestamp in ISO8601 format by default. + static const auto formatter = + functions::buildJodaDateTimeFormatter("yyyy-MM-dd'T'HH:mm:ss.SSSZZ").value(); + const auto* timeZone = + getTimeZoneFromConfig(context.execCtx()->queryCtx()->queryConfig()); + const auto maxResultSize = formatter->maxResultSize(timeZone); + *buffer = '"'; + const auto resultSize = formatter->format(value, timeZone, maxResultSize, buffer + 1, false, "Z"); + *(buffer + resultSize + 1) = '"'; + return resultSize + 2; +} + +template +size_t estimateRowSize(const TypePtr& type) { + if constexpr (std::is_same_v) { + return 5; + } else if constexpr (std::is_integral_v) { + return folly::detail::digitsEnough() + 1; + } else if constexpr (std::is_same_v) { + // yyyy-MM-dd'T'HH:mm:ss.SSSZZ + return 40; + } else if (type->isDate()) { + // yyyy-MM-dd. + return 12; + } else { + // For variable-length types, the initial size is set to 10. + return 10; } } @@ -103,17 +224,27 @@ void toJson( // input is guaranteed to be in flat or constant encodings when passed in. auto inputVector = input.as>(); - std::string result; + size_t rowSize = estimateRowSize(inputVector->type()); + Buffer* buffer = flatResult.getBufferWithSpace(rows.countSelected() * rowSize); + char* rawBuffer = buffer->asMutable() + buffer->size(); context.applyToSelectedNoThrow(rows, [&](auto row) { if (inputVector->isNullAt(row)) { flatResult.set(row, "null"); } else { - result.clear(); - generateJsonTyped(*inputVector, row, result, inputVector->type()); - - flatResult.set(row, StringView(result)); + auto size = VELOX_DYNAMIC_TYPE_DISPATCH( + convertToString, + kind, + inputVector->valueAt(row), + rawBuffer, + context, + inputVector->type()); + + flatResult.setNoCopy(row, StringView(rawBuffer, size)); + rawBuffer += size; } }); + // Update the exact buffer size. + buffer->setSize(rawBuffer - buffer->asMutable()); } // Forward declaration. @@ -298,7 +429,7 @@ void toJsonFromRow( }); } - // Extra length for commas and brackets + // Extra length for commas and brackets. childrenStringSize += rows.countSelected() * (childrenSize > 0 ? childrenSize + 1 : 2); flatResult.getBufferWithSpace(childrenStringSize); @@ -512,12 +643,17 @@ class ToJsonFunction final : public exec::VectorFunction { const TypePtr& outputType, exec::EvalCtx& context, VectorPtr& result) const final { + VELOX_USER_CHECK_EQ(args.size(), 1, "to_json takes one argument."); + auto kind = args[0]->typeKind(); + VELOX_USER_CHECK( + kind == TypeKind::ROW || kind == TypeKind::ARRAY || kind == TypeKind::MAP, + "to_json only support ROW/ARRAY/MAP inputs."); context.ensureWritable(rows, outputType, result); result->clearNulls(rows); auto* rawResults = result->as>(); VELOX_DYNAMIC_TYPE_DISPATCH_ALL( - toJson, args[0]->typeKind(), *args[0], context, rows, *rawResults); + toJson, kind, *args[0], context, rows, *rawResults); } static std::vector> signatures() { diff --git a/velox/functions/sparksql/tests/ToJsonTest.cpp b/velox/functions/sparksql/tests/ToJsonTest.cpp index 036d232b4127..00fc2110f2d1 100644 --- a/velox/functions/sparksql/tests/ToJsonTest.cpp +++ b/velox/functions/sparksql/tests/ToJsonTest.cpp @@ -13,15 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "velox/common/base/tests/GTestUtils.h" -#include -#include #include -#include -#include -#include -#include -#include using namespace facebook::velox::test; @@ -77,6 +69,18 @@ TEST_F(ToJsonTest, basicBool) { testToJson(input, expected); } +TEST_F(ToJsonTest, basicString) { + auto data = makeNullableFlatVector({"str1", "str2", std::nullopt, "str\"3\"", std::nullopt}); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":"str1"})", + R"({"a":"str2"})", + R"({"a":null})", + R"({"a":"str\"3\""})", + R"({"a":null})"}); + testToJson(input, expected); +} + TEST_F(ToJsonTest, basicTinyInt) { auto data = makeNullableFlatVector({0, 127, 128, -128, -129, std::nullopt}); auto input = makeRowVector({"a"}, {data}); @@ -138,19 +142,38 @@ TEST_F(ToJsonTest, basicDouble) { testToJson(input, expected); } +TEST_F(ToJsonTest, basicDecimal) { + auto data = makeNullableFlatVector( + {12345, 0, -67890, std::nullopt}, DECIMAL(10, 2)); + auto input = makeRowVector({"a"}, {data}); + auto expected = makeFlatVector( + {R"({"a":123.45})", + R"({"a":0.00})", + R"({"a":-678.90})", + R"({"a":null})"}); + testToJson(input, expected); +} + TEST_F(ToJsonTest, basicTimestamp) { auto data = makeNullableFlatVector( {Timestamp(0, 0), Timestamp(1582934400, 0), Timestamp(-2208988800, 0), - Timestamp(253402300799, 0), std::nullopt}); auto input = makeRowVector({"a"}, {data}); + // UTC time zone. auto expected = makeFlatVector( - {R"({"a":"1970-01-01T00:00:00.000000000"})", - R"({"a":"2020-02-29T00:00:00.000000000"})", - R"({"a":"1900-01-01T00:00:00.000000000"})", - R"({"a":"9999-12-31T23:59:59.000000000"})", + {R"({"a":"1970-01-01T00:00:00.000Z"})", + R"({"a":"2020-02-29T00:00:00.000Z"})", + R"({"a":"1900-01-01T00:00:00.000Z"})", + R"({"a":null})"}); + testToJson(input, expected); + // Los_Angeles time zone. + setTimezone("America/Los_Angeles"); + expected = makeFlatVector( + {R"({"a":"1969-12-31T16:00:00.000-08:00"})", + R"({"a":"2020-02-28T16:00:00.000-08:00"})", + R"({"a":"1899-12-31T16:00:00.000-08:00"})", R"({"a":null})"}); testToJson(input, expected); } From 883429016a108e01efed14e307725becbc94e076 Mon Sep 17 00:00:00 2001 From: wecharyu Date: Fri, 28 Feb 2025 02:59:43 +0000 Subject: [PATCH 4/5] fix conflicts --- velox/functions/sparksql/ToJson.cpp | 14 ++++++++------ .../sparksql/registration/RegisterJson.cpp | 1 - 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/velox/functions/sparksql/ToJson.cpp b/velox/functions/sparksql/ToJson.cpp index fa9ac15a0876..6c416f8cb643 100644 --- a/velox/functions/sparksql/ToJson.cpp +++ b/velox/functions/sparksql/ToJson.cpp @@ -17,6 +17,8 @@ #include "velox/functions/prestosql/types/JsonType.h" #include +#include +#include #include #include #include @@ -167,9 +169,9 @@ size_t convertToString( char* const buffer, exec::EvalCtx& context, const TypePtr& type) { - size_t size = escapedStringSize(value.data(), value.size()); + size_t size = normalizedSizeForJsonCast(value.data(), value.size()); *buffer = '"'; - escapeString(value.data(), size, buffer + 1); + normalizeForJsonCast(value.data(), size, buffer + 1); *(buffer + size + 1) = '"'; return size + 2; } @@ -347,7 +349,7 @@ struct AsJson { } // Appends the json string of the value at i to a string writer. - void append(vector_size_t i, exec::StringWriter<>& proxy) const { + void append(vector_size_t i, exec::StringWriter& proxy) const { if (decoded_->isNullAt(i)) { proxy.append("null"); } else { @@ -441,7 +443,7 @@ void toJsonFromRow( return; } - auto proxy = exec::StringWriter<>(&flatResult, row); + auto proxy = exec::StringWriter(&flatResult, row); proxy.append("{"_sv); for (int i = 0; i < childrenSize; ++i) { @@ -525,7 +527,7 @@ void toJsonFromArray( auto offset = inputArray->offsetAt(row); auto size = inputArray->sizeAt(row); - auto proxy = exec::StringWriter<>(&flatResult, row); + auto proxy = exec::StringWriter(&flatResult, row); proxy.append("["_sv); for (int i = offset, end = offset + size; i < end; ++i) { @@ -618,7 +620,7 @@ void toJsonFromMap( } std::sort(sortedKeys.begin(), sortedKeys.end()); - auto proxy = exec::StringWriter<>(&flatResult, row); + auto proxy = exec::StringWriter(&flatResult, row); proxy.append("{"_sv); for (auto it = sortedKeys.begin(); it != sortedKeys.end(); ++it) { diff --git a/velox/functions/sparksql/registration/RegisterJson.cpp b/velox/functions/sparksql/registration/RegisterJson.cpp index 6a12b60468dd..9cb69a164780 100644 --- a/velox/functions/sparksql/registration/RegisterJson.cpp +++ b/velox/functions/sparksql/registration/RegisterJson.cpp @@ -18,7 +18,6 @@ #include "velox/functions/lib/RegistrationHelpers.h" #include "velox/functions/sparksql/GetJsonObject.h" #include "velox/functions/sparksql/JsonObjectKeys.h" -#include namespace facebook::velox::functions::sparksql { From d4682707853e1568a227315f2b8095d9f50c113c Mon Sep 17 00:00:00 2001 From: wecharyu Date: Fri, 28 Feb 2025 03:44:52 +0000 Subject: [PATCH 5/5] fix code format --- velox/functions/sparksql/ToJson.cpp | 82 +++++++++---------- velox/functions/sparksql/tests/ToJsonTest.cpp | 37 ++++----- 2 files changed, 58 insertions(+), 61 deletions(-) diff --git a/velox/functions/sparksql/ToJson.cpp b/velox/functions/sparksql/ToJson.cpp index 6c416f8cb643..3139c4170a45 100644 --- a/velox/functions/sparksql/ToJson.cpp +++ b/velox/functions/sparksql/ToJson.cpp @@ -27,11 +27,13 @@ namespace facebook::velox::functions::sparksql { namespace { -template -std::enable_if_t, size_t> -append(T value, char* const buffer) { +template +std::enable_if_t, size_t> append( + T value, + char* const buffer) { const auto oute = buffer + folly::to_ascii_size_max_decimal + 1; - auto uvalue = value < 0 ? ~static_cast(value) + 1 : static_cast(value); + auto uvalue = value < 0 ? ~static_cast(value) + 1 + : static_cast(value); size_t p = 0; char* writtenPosition = buffer; if (value < 0) { @@ -42,14 +44,14 @@ append(T value, char* const buffer) { return p; } -template -std::enable_if_t, size_t> -append(T value, char* const buffer) { +template +std::enable_if_t, size_t> append( + T value, + char* const buffer) { std::string result; if (FOLLY_UNLIKELY(std::isinf(value) || std::isnan(value))) { result = fmt::format( - "\"{}\"", - util::Converter::tryCast(value).value()); + "\"{}\"", util::Converter::tryCast(value).value()); } else { result = util::Converter::tryCast(value).value(); } @@ -57,7 +59,7 @@ append(T value, char* const buffer) { return result.size(); } -template +template size_t convertToString( T value, char* const buffer, @@ -66,7 +68,7 @@ size_t convertToString( VELOX_FAIL("{} is not supported in to_json.", type->toString()); } -template<> +template <> size_t convertToString( bool value, char* const buffer, @@ -81,7 +83,7 @@ size_t convertToString( return size; } -template<> +template <> size_t convertToString( int8_t value, char* const buffer, @@ -90,7 +92,7 @@ size_t convertToString( return append(value, buffer); } -template<> +template <> size_t convertToString( int16_t value, char* const buffer, @@ -99,7 +101,7 @@ size_t convertToString( return append(value, buffer); } -template<> +template <> size_t convertToString( int32_t value, char* const buffer, @@ -107,13 +109,14 @@ size_t convertToString( const TypePtr& type) { if (type->isDate()) { std::string stringValue = DATE()->toString(value); - return snprintf(buffer, stringValue.size() + 3, "\"%s\"", stringValue.c_str()); + return snprintf( + buffer, stringValue.size() + 3, "\"%s\"", stringValue.c_str()); } else { return append(value, buffer); } } -template<> +template <> size_t convertToString( int64_t value, char* const buffer, @@ -128,7 +131,7 @@ size_t convertToString( } } -template<> +template <> size_t convertToString( int128_t value, char* const buffer, @@ -145,7 +148,7 @@ size_t convertToString( return p; } -template<> +template <> size_t convertToString( float value, char* const buffer, @@ -154,7 +157,7 @@ size_t convertToString( return append(value, buffer); } -template<> +template <> size_t convertToString( double value, char* const buffer, @@ -163,7 +166,7 @@ size_t convertToString( return append(value, buffer); } -template<> +template <> size_t convertToString( StringView value, char* const buffer, @@ -176,7 +179,7 @@ size_t convertToString( return size + 2; } -template<> +template <> size_t convertToString( Timestamp value, char* const buffer, @@ -184,12 +187,14 @@ size_t convertToString( const TypePtr& type) { // Spark converts Timestamp in ISO8601 format by default. static const auto formatter = - functions::buildJodaDateTimeFormatter("yyyy-MM-dd'T'HH:mm:ss.SSSZZ").value(); + functions::buildJodaDateTimeFormatter("yyyy-MM-dd'T'HH:mm:ss.SSSZZ") + .value(); const auto* timeZone = getTimeZoneFromConfig(context.execCtx()->queryCtx()->queryConfig()); const auto maxResultSize = formatter->maxResultSize(timeZone); *buffer = '"'; - const auto resultSize = formatter->format(value, timeZone, maxResultSize, buffer + 1, false, "Z"); + const auto resultSize = + formatter->format(value, timeZone, maxResultSize, buffer + 1, false, "Z"); *(buffer + resultSize + 1) = '"'; return resultSize + 2; } @@ -203,12 +208,12 @@ size_t estimateRowSize(const TypePtr& type) { } else if constexpr (std::is_same_v) { // yyyy-MM-dd'T'HH:mm:ss.SSSZZ return 40; - } else if (type->isDate()) { + } else if (type->isDate()) { // yyyy-MM-dd. return 12; } else { // For variable-length types, the initial size is set to 10. - return 10; + return 10; } } @@ -227,7 +232,8 @@ void toJson( auto inputVector = input.as>(); size_t rowSize = estimateRowSize(inputVector->type()); - Buffer* buffer = flatResult.getBufferWithSpace(rows.countSelected() * rowSize); + Buffer* buffer = + flatResult.getBufferWithSpace(rows.countSelected() * rowSize); char* rawBuffer = buffer->asMutable() + buffer->size(); context.applyToSelectedNoThrow(rows, [&](auto row) { if (inputVector->isNullAt(row)) { @@ -306,7 +312,7 @@ struct AsJson { if (!exec::PeeledEncoding::isPeelable(input->encoding())) { serialize(context, input, rows, json_); } else { - exec::withContextSaver([&](exec::ContextSaver& saver){ + exec::withContextSaver([&](exec::ContextSaver& saver) { exec::LocalSelectivityVector newRowsHodler(*context.execCtx()); exec::LocalDecodedVector localDecoded(context); @@ -367,12 +373,7 @@ struct AsJson { auto flatJsonStrings = result->as>(); VELOX_DYNAMIC_TYPE_DISPATCH_ALL( - toJson, - input->typeKind(), - *input, - context, - baseRows, - *flatJsonStrings); + toJson, input->typeKind(), *input, context, baseRows, *flatJsonStrings); } // Combine exceptions in oldErrors into context.errors_ with a transformation @@ -492,8 +493,7 @@ void toJsonFromArray( auto elementToTopLevelRows = functions::getElementToTopLevelRows( elements->size(), rows, inputArray, context.pool()); - AsJson elementsAsJson{ - context, elements, elementsRows, elementToTopLevelRows}; + AsJson elementsAsJson{context, elements, elementsRows, elementToTopLevelRows}; // Estimates an upperbound of the total length of all Json strings for the // input according to the length of all elements Json strings and the @@ -573,10 +573,8 @@ void toJsonFromMap( auto elementToTopLevelRows = functions::getElementToTopLevelRows( mapKeys->size(), rows, inputMap, context.pool()); - AsJson keysAsJson{ - context, mapKeys, elementsRows, elementToTopLevelRows}; - AsJson valuesAsJson{ - context, mapValues, elementsRows, elementToTopLevelRows}; + AsJson keysAsJson{context, mapKeys, elementsRows, elementToTopLevelRows}; + AsJson valuesAsJson{context, mapValues, elementsRows, elementToTopLevelRows}; // Estimates an upperbound of the total length of all Json strings for the // input according to the length of all elements Json strings and the @@ -627,7 +625,8 @@ void toJsonFromMap( if (it != sortedKeys.begin()) { proxy.append(","_sv); } - std::string keyFormat = mapType.childAt(0)->isVarchar() ? "{}:" : "\"{}\":"; + std::string keyFormat = + mapType.childAt(0)->isVarchar() ? "{}:" : "\"{}\":"; proxy.append(fmt::format(keyFormat, it->first)); valuesAsJson.append(it->second, proxy); } @@ -648,7 +647,8 @@ class ToJsonFunction final : public exec::VectorFunction { VELOX_USER_CHECK_EQ(args.size(), 1, "to_json takes one argument."); auto kind = args[0]->typeKind(); VELOX_USER_CHECK( - kind == TypeKind::ROW || kind == TypeKind::ARRAY || kind == TypeKind::MAP, + kind == TypeKind::ROW || kind == TypeKind::ARRAY || + kind == TypeKind::MAP, "to_json only support ROW/ARRAY/MAP inputs."); context.ensureWritable(rows, outputType, result); result->clearNulls(rows); diff --git a/velox/functions/sparksql/tests/ToJsonTest.cpp b/velox/functions/sparksql/tests/ToJsonTest.cpp index 00fc2110f2d1..1bd527919656 100644 --- a/velox/functions/sparksql/tests/ToJsonTest.cpp +++ b/velox/functions/sparksql/tests/ToJsonTest.cpp @@ -28,7 +28,7 @@ class ToJsonTest : public SparkFunctionBaseTest { protected: core::CallTypedExprPtr createToJson(const TypePtr& inputType) { std::vector inputs = { - std::make_shared(inputType, "c0")}; + std::make_shared(inputType, "c0")}; return std::make_shared( VARCHAR(), std::move(inputs), "to_json"); } @@ -41,8 +41,8 @@ class ToJsonTest : public SparkFunctionBaseTest { TEST_F(ToJsonTest, basicStruct) { auto input = makeRowVector({"a"}, {makeFlatVector({1, 2, 3})}); - auto expected = makeFlatVector( - {R"({"a":1})", R"({"a":2})", R"({"a":3})"}); + auto expected = + makeFlatVector({R"({"a":1})", R"({"a":2})", R"({"a":3})"}); testToJson(input, expected); } @@ -53,16 +53,15 @@ TEST_F(ToJsonTest, basicArray) { } TEST_F(ToJsonTest, basicMap) { - auto input = makeMapVector( - {{{"a", 1}}, {{"b", 2}}, {{"c", 3}}}); - auto expected = makeFlatVector( - {R"({"a":1})", R"({"b":2})", R"({"c":3})"}); + auto input = + makeMapVector({{{"a", 1}}, {{"b", 2}}, {{"c", 3}}}); + auto expected = + makeFlatVector({R"({"a":1})", R"({"b":2})", R"({"c":3})"}); testToJson(input, expected); } TEST_F(ToJsonTest, basicBool) { - auto data = makeNullableFlatVector( - {true, false, std::nullopt}); + auto data = makeNullableFlatVector({true, false, std::nullopt}); auto input = makeRowVector({"a"}, {data}); auto expected = makeFlatVector( {R"({"a":true})", R"({"a":false})", R"({"a":null})"}); @@ -70,7 +69,8 @@ TEST_F(ToJsonTest, basicBool) { } TEST_F(ToJsonTest, basicString) { - auto data = makeNullableFlatVector({"str1", "str2", std::nullopt, "str\"3\"", std::nullopt}); + auto data = makeNullableFlatVector( + {"str1", "str2", std::nullopt, "str\"3\"", std::nullopt}); auto input = makeRowVector({"a"}, {data}); auto expected = makeFlatVector( {R"({"a":"str1"})", @@ -82,7 +82,8 @@ TEST_F(ToJsonTest, basicString) { } TEST_F(ToJsonTest, basicTinyInt) { - auto data = makeNullableFlatVector({0, 127, 128, -128, -129, std::nullopt}); + auto data = + makeNullableFlatVector({0, 127, 128, -128, -129, std::nullopt}); auto input = makeRowVector({"a"}, {data}); auto expected = makeFlatVector( {R"({"a":0})", @@ -98,15 +99,13 @@ TEST_F(ToJsonTest, basicSmallInt) { auto data = makeNullableFlatVector({0, 32768, -32769, std::nullopt}); auto input = makeRowVector({"a"}, {data}); auto expected = makeFlatVector( - {R"({"a":0})", - R"({"a":-32768})", - R"({"a":32767})", - R"({"a":null})"}); + {R"({"a":0})", R"({"a":-32768})", R"({"a":32767})", R"({"a":null})"}); testToJson(input, expected); } TEST_F(ToJsonTest, basicInt) { - auto data = makeNullableFlatVector({0, 2147483648, -2147483649, std::nullopt}); + auto data = makeNullableFlatVector( + {0, 2147483648, -2147483649, std::nullopt}); auto input = makeRowVector({"a"}, {data}); auto expected = makeFlatVector( {R"({"a":0})", @@ -193,10 +192,8 @@ TEST_F(ToJsonTest, basicDate) { TEST_F(ToJsonTest, nestedComplexType) { auto data1 = makeNullableFlatVector({"str1", "str2", "str3"}); - auto data2 = makeNullableArrayVector({ - {1, 2, 3}, - {}, - {std::nullopt}}); + auto data2 = + makeNullableArrayVector({{1, 2, 3}, {}, {std::nullopt}}); auto data3 = makeMapVector( {{{"key1", 1}}, {{"key2", 2}}, {{"key3", 3}}}); auto input = makeRowVector({"a", "b", "c"}, {data1, data2, data3});