From a7c62b5d6713b6b901d3ed44847726a766c2f144 Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Fri, 26 May 2023 09:12:29 +0800 Subject: [PATCH] Fix semantic issues in cast function (#280) --- velox/expression/tests/CastExprTest.cpp | 17 +++++++++++ .../lib/tests/DateTimeFormatterTest.cpp | 20 ++++++++----- velox/type/Conversions.h | 29 +++++++++++++++---- velox/type/TimestampConversion.cpp | 8 ++++- velox/type/tests/TimestampConversionTest.cpp | 3 +- 5 files changed, 62 insertions(+), 15 deletions(-) diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp index 4597e0ec0c83a..9a91df6e1102e 100644 --- a/velox/expression/tests/CastExprTest.cpp +++ b/velox/expression/tests/CastExprTest.cpp @@ -421,6 +421,16 @@ TEST_F(CastExprTest, date) { setCastIntByTruncate(true); testCast("date", input, result); + + // Wrong date format case. + std::vector> inputWrongFormat{ + "1970-01/01", "2023/05/10", "2023-/05-/10", "20150318"}; + std::vector> nullResult{ + std::nullopt, std::nullopt, std::nullopt, std::nullopt}; + testCast( + "date", inputWrongFormat, nullResult, false, true); + testCast( + "date", inputWrongFormat, nullResult, true, false); } TEST_F(CastExprTest, invalidDate) { @@ -552,6 +562,13 @@ TEST_F(CastExprTest, allowDecimal) { "int", {"-.", "0.0", "125.5", "-128.3"}, {0, 0, 125, -128}, false, true); } +TEST_F(CastExprTest, sparkSemantic) { + // Allow decimal. + setCastIntAllowDecimalAndByTruncate(true); + testCast( + "bool", {0.5, -0.5, 1, 0}, {true, true, true, false}, false, true); +} + constexpr vector_size_t kVectorSize = 1'000; TEST_F(CastExprTest, mapCast) { diff --git a/velox/functions/lib/tests/DateTimeFormatterTest.cpp b/velox/functions/lib/tests/DateTimeFormatterTest.cpp index 659164f91693f..950ffb4844700 100644 --- a/velox/functions/lib/tests/DateTimeFormatterTest.cpp +++ b/velox/functions/lib/tests/DateTimeFormatterTest.cpp @@ -547,11 +547,13 @@ TEST_F(JodaDateTimeFormatterTest, parseYear) { EXPECT_THROW(parseJoda("++100", "y"), VeloxUserError); // Probe the year range - EXPECT_THROW(parseJoda("-292275056", "y"), VeloxUserError); - EXPECT_THROW(parseJoda("292278995", "y"), VeloxUserError); - EXPECT_EQ( - util::fromTimestampString("292278994-01-01"), - parseJoda("292278994", "y").timestamp); + // Temporarily removed for adapting to spark semantic (not allowed year digits + // larger than 7). + // EXPECT_THROW(parseJoda("-292275056", "y"), VeloxUserError); + // EXPECT_THROW(parseJoda("292278995", "y"), VeloxUserError); + // EXPECT_EQ( + // util::fromTimestampString("292278994-01-01"), + // parseJoda("292278994", "y").timestamp); } TEST_F(JodaDateTimeFormatterTest, parseWeekYear) { @@ -626,9 +628,11 @@ TEST_F(JodaDateTimeFormatterTest, parseWeekYear) { TEST_F(JodaDateTimeFormatterTest, parseCenturyOfEra) { // Probe century range - EXPECT_EQ( - util::fromTimestampString("292278900-01-01 00:00:00"), - parseJoda("2922789", "CCCCCCC").timestamp); + // Temporarily removed for adapting to spark semantic (not allowed year digits + // larger than 7). + // EXPECT_EQ( + // util::fromTimestampString("292278900-01-01 00:00:00"), + // parseJoda("2922789", "CCCCCCC").timestamp); EXPECT_EQ( util::fromTimestampString("00-01-01 00:00:00"), parseJoda("0", "C").timestamp); diff --git a/velox/type/Conversions.h b/velox/type/Conversions.h index 5ff4ecec5a833..adee086faccc4 100644 --- a/velox/type/Conversions.h +++ b/velox/type/Conversions.h @@ -301,7 +301,9 @@ struct Converter< if (v > LimitType::maxLimit()) { return LimitType::max(); } - if (v < LimitType::minLimit()) { + // bool type's min is 0, but spark expects true for casting negative float + // data. + if (!std::is_same_v && v < LimitType::minLimit()) { return LimitType::min(); } return LimitType::cast(v); @@ -321,7 +323,9 @@ struct Converter< if (v > LimitType::maxLimit()) { return LimitType::max(); } - if (v < LimitType::minLimit()) { + // bool type's min is 0, but spark expects true for casting negative float + // data. + if (!std::is_same_v && v < LimitType::minLimit()) { return LimitType::min(); } return LimitType::cast(v); @@ -596,15 +600,30 @@ struct Converter { } static T cast(folly::StringPiece v, bool& nullOutput) { - return fromDateString(v.data(), v.size()); + try { + return fromDateString(v.data(), v.size()); + } catch (const VeloxUserError& ve) { + nullOutput = true; + return (T)0; + } } static T cast(const StringView& v, bool& nullOutput) { - return fromDateString(v.data(), v.size()); + try { + return fromDateString(v.data(), v.size()); + } catch (const VeloxUserError& ve) { + nullOutput = true; + return (T)0; + } } static T cast(const std::string& v, bool& nullOutput) { - return fromDateString(v.data(), v.size()); + try { + return fromDateString(v.data(), v.size()); + } catch (const VeloxUserError& ve) { + nullOutput = true; + return (T)0; + } } static T cast(const Timestamp& t, bool& nullOutput) { diff --git a/velox/type/TimestampConversion.cpp b/velox/type/TimestampConversion.cpp index 590a1e1d347f7..a657a250110ee 100644 --- a/velox/type/TimestampConversion.cpp +++ b/velox/type/TimestampConversion.cpp @@ -177,12 +177,17 @@ bool tryParseDateString( if (!characterIsDigit(buf[pos])) { return false; } + int yearSegStart = pos; // First parse the year. for (; pos < len && characterIsDigit(buf[pos]); pos++) { year = checkedPlus((buf[pos] - '0'), checkedMultiply(year, 10)); if (year > kMaxYear) { break; } + // Align with spark, year digits should not be greater than 7. + if (pos - yearSegStart + 1 > 7) { + return false; + } } if (yearneg) { year = checkedNegate(year); @@ -203,7 +208,8 @@ bool tryParseDateString( // Fetch the separator. sep = buf[pos++]; - if (sep != ' ' && sep != '-' && sep != '/' && sep != '\\') { + // For spark, "/" separtor is not supported. + if (sep != ' ' && sep != '-' && sep != '\\') { // Invalid separator. return false; } diff --git a/velox/type/tests/TimestampConversionTest.cpp b/velox/type/tests/TimestampConversionTest.cpp index 5994875eade77..edad734549d68 100644 --- a/velox/type/tests/TimestampConversionTest.cpp +++ b/velox/type/tests/TimestampConversionTest.cpp @@ -83,7 +83,8 @@ TEST(DateTimeUtilTest, fromDateString) { EXPECT_EQ(-719162, fromDateString(" \t \n 00001-1-1 \n")); // Different separators. - EXPECT_EQ(-719162, fromDateString("1/1/1")); + // Illegal date format for spark. + // EXPECT_EQ(-719162, fromDateString("1/1/1")); EXPECT_EQ(-719162, fromDateString("1 1 1")); EXPECT_EQ(-719162, fromDateString("1\\1\\1"));