diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index a6e96003ec34d..cf3b5c86dcf69 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -351,9 +351,8 @@ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pat public static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { - if (pattern.numBytes() == 0) { - return target.indexOfEmpty(start); - } + if (pattern.numBytes() == 0) return target.indexOfEmpty(start); + if (target.numBytes() == 0) return MATCH_NOT_FOUND; StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); stringSearch.setIndex(start); diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index f7e6f76199cee..272bf5ab3e9c2 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -671,7 +671,7 @@ protected Collation buildCollation() { (s1, s2) -> collator.compare(s1.toString(), s2.toString()), ICU_COLLATOR_VERSION, s -> (long) collator.getCollationKey(s.toString()).hashCode(), - /* supportsBinaryEquality = */ collationId == UNICODE_COLLATION_ID, + /* supportsBinaryEquality = */ false, /* supportsBinaryOrdering = */ false, /* supportsLowercaseEquality = */ false); } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index d5045721f941e..b47f95ad7c299 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -631,6 +631,8 @@ public void testStringInstr() throws SparkException { assertStringInstr("", "xxxx", "UNICODE", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0); assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); + assertStringInstr("xxxx", "", "UNICODE_CI", 1); + assertStringInstr("", "xxxx", "UNICODE_CI", 0); assertStringInstr("aaads", "AD", "UNICODE_CI", 3); assertStringInstr("aaads", "dS", "UNICODE_CI", 4); assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); @@ -1040,20 +1042,6 @@ public void testStringTrim() throws SparkException { assertStringTrim("UTF8_BINARY_LCASE", "xxasdxx", "x", "asd"); assertStringTrim("UTF8_BINARY_LCASE", "xa世ax", "x", "a世a"); - assertStringTrimLeft("UNICODE", "asd", null, "asd"); - assertStringTrimLeft("UNICODE", " asd ", null, "asd "); - assertStringTrimLeft("UNICODE", " a世a ", null, "a世a "); - assertStringTrimLeft("UNICODE", "asd", "x", "asd"); - assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx"); - assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax"); - - assertStringTrimRight("UNICODE", "asd", null, "asd"); - assertStringTrimRight("UNICODE", " asd ", null, " asd"); - assertStringTrimRight("UNICODE", " a世a ", null, " a世a"); - assertStringTrimRight("UNICODE", "asd", "x", "asd"); - assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd"); - assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a"); - // Test cases where trimString has more than one character assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX"); assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa"); @@ -1063,22 +1051,14 @@ public void testStringTrim() throws SparkException { assertStringTrimLeft("UTF8_BINARY_LCASE", "ddsXXXaa", "asd", "XXXaa"); assertStringTrimRight("UTF8_BINARY_LCASE", "ddsXXXaa", "asd", "ddsXXX"); - assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX"); - assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa"); - assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX"); - // Test cases specific to collation type // uppercase trim, lowercase src assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); assertStringTrim("UTF8_BINARY_LCASE", "asd", "A", "sd"); - assertStringTrim("UNICODE", "asd", "A", "asd"); - assertStringTrim("UNICODE_CI", "asd", "A", "sd"); // lowercase trim, uppercase src assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); assertStringTrim("UTF8_BINARY_LCASE", "ASD", "a", "SD"); - assertStringTrim("UNICODE", "ASD", "a", "ASD"); - assertStringTrim("UNICODE_CI", "ASD", "a", "SD"); // uppercase and lowercase chars of different byte-length (utf8) assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); @@ -1089,10 +1069,6 @@ public void testStringTrim() throws SparkException { assertStringTrimLeft("UTF8_BINARY_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); assertStringTrimRight("UTF8_BINARY_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); - assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); - assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); @@ -1101,10 +1077,6 @@ public void testStringTrim() throws SparkException { assertStringTrimLeft("UTF8_BINARY_LCASE", "ßaaaß", "ẞ", "aaaß"); assertStringTrimRight("UTF8_BINARY_LCASE", "ßaaaß", "ẞ", "ßaaa"); - assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); - // different byte-length (utf8) chars trimmed assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ"); @@ -1113,10 +1085,6 @@ public void testStringTrim() throws SparkException { assertStringTrim("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); assertStringTrimRight("UTF8_BINARY_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); - - assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa"); - assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); - assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); } // TODO: Test more collation-aware string expressions. diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala index 69104dea0e992..35a40ba9f398c 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala @@ -48,7 +48,7 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig assert(UNICODE_COLLATION_ID == (1 << 29)) val unicode = fetchCollation(UNICODE_COLLATION_ID) assert(unicode.collationName == "UNICODE") - assert(unicode.supportsBinaryEquality) + assert(!unicode.supportsBinaryEquality) assert(UNICODE_CI_COLLATION_ID == ((1 << 29) | (1 << 17))) val unicodeCi = fetchCollation(UNICODE_CI_COLLATION_ID) @@ -131,18 +131,24 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig CollationTestCase("UTF8_BINARY", "aaa", "aaa", true), CollationTestCase("UTF8_BINARY", "aaa", "AAA", false), CollationTestCase("UTF8_BINARY", "aaa", "bbb", false), + CollationTestCase("UTF8_BINARY", "å", "a\u030A", false), CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aaa", true), CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AAA", true), CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", true), CollationTestCase("UTF8_BINARY_LCASE", "aaa", "AaA", true), CollationTestCase("UTF8_BINARY_LCASE", "aaa", "aa", false), CollationTestCase("UTF8_BINARY_LCASE", "aaa", "bbb", false), + CollationTestCase("UTF8_BINARY_LCASE", "å", "a\u030A", false), CollationTestCase("UNICODE", "aaa", "aaa", true), CollationTestCase("UNICODE", "aaa", "AAA", false), CollationTestCase("UNICODE", "aaa", "bbb", false), + CollationTestCase("UNICODE", "å", "a\u030A", true), CollationTestCase("UNICODE_CI", "aaa", "aaa", true), CollationTestCase("UNICODE_CI", "aaa", "AAA", true), - CollationTestCase("UNICODE_CI", "aaa", "bbb", false)) + CollationTestCase("UNICODE_CI", "aaa", "bbb", false), + CollationTestCase("UNICODE_CI", "å", "a\u030A", true), + CollationTestCase("UNICODE_CI", "Å", "a\u030A", true) + ) checks.foreach(testCase => { val collation = fetchCollation(testCase.collationName) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala index f278b8e5899d8..cf6f29f9df097 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala @@ -174,10 +174,10 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { ("aa", "UTF8_BINARY_LCASE", UTF8String.fromString("aa").getBytes), ("AA", "UTF8_BINARY_LCASE", UTF8String.fromString("aa").getBytes), ("aA", "UTF8_BINARY_LCASE", UTF8String.fromString("aa").getBytes), - ("", "UNICODE", UTF8String.fromString("").getBytes), - ("aa", "UNICODE", UTF8String.fromString("aa").getBytes), - ("AA", "UNICODE", UTF8String.fromString("AA").getBytes), - ("aA", "UNICODE", UTF8String.fromString("aA").getBytes), + ("", "UNICODE", Array[Byte](1, 1, 0)), + ("aa", "UNICODE", Array[Byte](42, 42, 1, 6, 1, 6, 0)), + ("AA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -36, -36, 0)), + ("aA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -59, -36, 0)), ("", "UNICODE_CI", Array[Byte](1, 0)), ("aa", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)), ("AA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala index cc50aebf589e7..4085022e7ab8c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala @@ -34,10 +34,10 @@ class CollationRegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalH LikeTestCase("", "", "", "UTF8_BINARY_LCASE", true, true, true), LikeTestCase("Foo", "", "", "UTF8_BINARY_LCASE", false, false, true), LikeTestCase("", "%foo%", ".o.", "UTF8_BINARY_LCASE", false, false, false), - LikeTestCase("AbC", "%ABC%", ".B.", "UNICODE", false, true, false), - LikeTestCase(null, "%foo%", ".o.", "UNICODE", null, null, null), - LikeTestCase("Foo", null, null, "UNICODE", null, null, null), - LikeTestCase(null, null, null, "UNICODE", null, null, null) + LikeTestCase("AbC", "%ABC%", ".B.", "UTF8_BINARY", false, true, false), + LikeTestCase(null, "%foo%", ".o.", "UTF8_BINARY", null, null, null), + LikeTestCase("Foo", null, null, "UTF8_BINARY", null, null, null), + LikeTestCase(null, null, null, "UTF8_BINARY", null, null, null) ) testCases.foreach(t => { // Like @@ -62,13 +62,13 @@ class CollationRegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalH StringSplitTestCase("1A2B3C", "[abc]", "UTF8_BINARY", Seq("1A2B3C")), StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_BINARY_LCASE", Seq("1", "2", "3", "")), StringSplitTestCase("1A2B3C", "[abc]", "UTF8_BINARY_LCASE", Seq("1", "2", "3", "")), - StringSplitTestCase("1A2B3C", "[1-9]+", "UNICODE", Seq("", "A", "B", "C")), - StringSplitTestCase("", "", "UNICODE", Seq("")), - StringSplitTestCase("1A2B3C", "", "UNICODE", Seq("1", "A", "2", "B", "3", "C")), - StringSplitTestCase("", "[1-9]+", "UNICODE", Seq("")), - StringSplitTestCase(null, "[1-9]+", "UNICODE", null), - StringSplitTestCase("1A2B3C", null, "UNICODE", null), - StringSplitTestCase(null, null, "UNICODE", null) + StringSplitTestCase("1A2B3C", "[1-9]+", "UTF8_BINARY", Seq("", "A", "B", "C")), + StringSplitTestCase("", "", "UTF8_BINARY", Seq("")), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2", "B", "3", "C")), + StringSplitTestCase("", "[1-9]+", "UTF8_BINARY", Seq("")), + StringSplitTestCase(null, "[1-9]+", "UTF8_BINARY", null), + StringSplitTestCase("1A2B3C", null, "UTF8_BINARY", null), + StringSplitTestCase(null, null, "UTF8_BINARY", null) ) testCases.foreach(t => { // StringSplit @@ -89,10 +89,10 @@ class CollationRegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalH RegexpTestCase("", "", "UTF8_BINARY_LCASE", "", Seq(""), 1), RegexpTestCase("Foo", "", "UTF8_BINARY_LCASE", "", Seq("", "", "", ""), 4), RegexpTestCase("", ".o.", "UTF8_BINARY_LCASE", "", Seq(), 0), - RegexpTestCase("Foo", ".O.", "UNICODE", "", Seq(), 0), - RegexpTestCase(null, ".O.", "UNICODE", null, null, null), - RegexpTestCase("Foo", null, "UNICODE", null, null, null), - RegexpTestCase(null, null, "UNICODE", null, null, null) + RegexpTestCase("Foo", ".O.", "UTF8_BINARY", "", Seq(), 0), + RegexpTestCase(null, ".O.", "UTF8_BINARY", null, null, null), + RegexpTestCase("Foo", null, "UTF8_BINARY", null, null, null), + RegexpTestCase(null, null, "UTF8_BINARY", null, null, null) ) testCases.foreach(t => { // RegExpExtract @@ -124,47 +124,46 @@ class CollationRegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalH // Supported collations (StringTypeBinaryLcase) val binaryCollation = StringType(CollationFactory.collationNameToId("UTF8_BINARY")) val lowercaseCollation = StringType(CollationFactory.collationNameToId("UTF8_BINARY_LCASE")) - val unicodeCollation = StringType(CollationFactory.collationNameToId("UNICODE")) // LikeAll checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%oo"), true) checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%bar%"), false) checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAll("%foo%", "%oo"), true) checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAll("%foo%", "%bar%"), false) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%foo%", "%oo"), true) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%foo%", "%bar%"), false) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%foo%", nullStr), null) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%feo%", nullStr), false) - checkEvaluation(Literal.create(null, unicodeCollation).likeAll("%foo%", "%oo"), null) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", nullStr), null) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%feo%", nullStr), false) + checkEvaluation(Literal.create(null, binaryCollation).likeAll("%foo%", "%oo"), null) // NotLikeAll checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%foo%", "%oo"), false) checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%goo%", "%bar%"), true) checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAll("%foo%", "%oo"), false) checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAll("%goo%", "%bar%"), true) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%foo%", "%oo"), false) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%goo%", "%bar%"), true) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%foo%", nullStr), false) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%feo%", nullStr), null) - checkEvaluation(Literal.create(null, unicodeCollation).notLikeAll("%foo%", "%oo"), null) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%foo%", "%oo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%goo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%foo%", nullStr), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%feo%", nullStr), null) + checkEvaluation(Literal.create(null, binaryCollation).notLikeAll("%foo%", "%oo"), null) // LikeAny checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%goo%", "%hoo"), false) checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%foo%", "%bar%"), true) checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAny("%goo%", "%hoo"), false) checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAny("%foo%", "%bar%"), true) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%goo%", "%hoo"), false) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%foo%", "%bar%"), true) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%foo%", nullStr), true) - checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%feo%", nullStr), null) - checkEvaluation(Literal.create(null, unicodeCollation).likeAny("%foo%", "%oo"), null) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%goo%", "%hoo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%foo%", nullStr), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%feo%", nullStr), null) + checkEvaluation(Literal.create(null, binaryCollation).likeAny("%foo%", "%oo"), null) // NotLikeAny checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%hoo"), true) checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%oo%"), false) checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAny("%Foo%", "%hoo"), true) checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAny("%foo%", "%oo%"), false) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%Foo%", "%hoo"), true) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%foo%", "%oo%"), false) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%foo%", nullStr), null) - checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%feo%", nullStr), true) - checkEvaluation(Literal.create(null, unicodeCollation).notLikeAny("%foo%", "%oo"), null) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%Foo%", "%hoo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%oo%"), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", nullStr), null) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%feo%", nullStr), true) + checkEvaluation(Literal.create(null, binaryCollation).notLikeAny("%foo%", "%oo"), null) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index 739b000492c55..7d894ac7eb4f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -33,7 +33,7 @@ class CollationSQLRegexpSuite val testCases = Seq( LikeTestCase("ABC", "%B%", "UTF8_BINARY", true), LikeTestCase("AḂC", "%ḃ%", "UTF8_BINARY_LCASE", true), - LikeTestCase("ABC", "%b%", "UNICODE", false) + LikeTestCase("ABC", "%b%", "UTF8_BINARY", false) ) testCases.foreach(t => { val query = s"SELECT like(collate('${t.l}', '${t.c}'), '${t.r}')" @@ -61,7 +61,7 @@ class CollationSQLRegexpSuite val testCases = Seq( ILikeTestCase("ABC", "%b%", "UTF8_BINARY", true), ILikeTestCase("AḂC", "%ḃ%", "UTF8_BINARY_LCASE", true), - ILikeTestCase("ABC", "%b%", "UNICODE", true) + ILikeTestCase("ABC", "%b%", "UTF8_BINARY", true) ) testCases.foreach(t => { val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), '${t.r}')" @@ -89,7 +89,7 @@ class CollationSQLRegexpSuite val testCases = Seq( LikeAllTestCase("foo", Seq("%foo%", "%oo"), "UTF8_BINARY", true), LikeAllTestCase("Foo", Seq("%foo%", "%oo"), "UTF8_BINARY_LCASE", true), - LikeAllTestCase("foo", Seq("%foo%", "%bar%"), "UNICODE", false) + LikeAllTestCase("foo", Seq("%foo%", "%bar%"), "UTF8_BINARY", false) ) testCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ALL ('${t.p.mkString("','")}')" @@ -117,7 +117,7 @@ class CollationSQLRegexpSuite val testCases = Seq( NotLikeAllTestCase("foo", Seq("%foo%", "%oo"), "UTF8_BINARY", false), NotLikeAllTestCase("Foo", Seq("%foo%", "%oo"), "UTF8_BINARY_LCASE", false), - NotLikeAllTestCase("foo", Seq("%goo%", "%bar%"), "UNICODE", true) + NotLikeAllTestCase("foo", Seq("%goo%", "%bar%"), "UTF8_BINARY", true) ) testCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ALL ('${t.p.mkString("','")}')" @@ -145,7 +145,7 @@ class CollationSQLRegexpSuite val testCases = Seq( LikeAnyTestCase("foo", Seq("%foo%", "%bar"), "UTF8_BINARY", true), LikeAnyTestCase("Foo", Seq("%foo%", "%bar"), "UTF8_BINARY_LCASE", true), - LikeAnyTestCase("foo", Seq("%goo%", "%hoo%"), "UNICODE", false) + LikeAnyTestCase("foo", Seq("%goo%", "%hoo%"), "UTF8_BINARY", false) ) testCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ANY ('${t.p.mkString("','")}')" @@ -173,7 +173,7 @@ class CollationSQLRegexpSuite val testCases = Seq( NotLikeAnyTestCase("foo", Seq("%foo%", "%hoo"), "UTF8_BINARY", true), NotLikeAnyTestCase("Foo", Seq("%foo%", "%hoo"), "UTF8_BINARY_LCASE", true), - NotLikeAnyTestCase("foo", Seq("%foo%", "%oo%"), "UNICODE", false) + NotLikeAnyTestCase("foo", Seq("%foo%", "%oo%"), "UTF8_BINARY", false) ) testCases.foreach(t => { val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ANY ('${t.p.mkString("','")}')" @@ -201,7 +201,7 @@ class CollationSQLRegexpSuite val testCases = Seq( RLikeTestCase("ABC", ".B.", "UTF8_BINARY", true), RLikeTestCase("AḂC", ".ḃ.", "UTF8_BINARY_LCASE", true), - RLikeTestCase("ABC", ".b.", "UNICODE", false) + RLikeTestCase("ABC", ".b.", "UTF8_BINARY", false) ) testCases.foreach(t => { val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), '${t.r}')" @@ -229,7 +229,7 @@ class CollationSQLRegexpSuite val testCases = Seq( StringSplitTestCase("ABC", "[B]", "UTF8_BINARY", Seq("A", "C")), StringSplitTestCase("AḂC", "[ḃ]", "UTF8_BINARY_LCASE", Seq("A", "C")), - StringSplitTestCase("ABC", "[B]", "UNICODE", Seq("A", "C")) + StringSplitTestCase("ABC", "[B]", "UTF8_BINARY", Seq("A", "C")) ) testCases.foreach(t => { val query = s"SELECT split(collate('${t.l}', '${t.c}'), '${t.r}')" @@ -257,7 +257,7 @@ class CollationSQLRegexpSuite val testCases = Seq( RegExpReplaceTestCase("ABCDE", ".C.", "UTF8_BINARY", "AFFFE"), RegExpReplaceTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", "AFFFE"), - RegExpReplaceTestCase("ABCDE", ".c.", "UNICODE", "ABCDE") + RegExpReplaceTestCase("ABCDE", ".c.", "UTF8_BINARY", "ABCDE") ) testCases.foreach(t => { val query = @@ -272,8 +272,9 @@ class CollationSQLRegexpSuite Row(t.result)) }) // Collation mismatch + val (c1, c2) = ("UTF8_BINARY", "UTF8_BINARY_LCASE") val collationMismatch = intercept[AnalysisException] { - sql("SELECT regexp_replace(collate('ABCDE','UTF8_BINARY'), '.c.', collate('FFF','UNICODE'))") + sql(s"SELECT regexp_replace(collate('ABCDE','$c1'), '.c.', collate('FFF','$c2'))") } assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") // Unsupported collations @@ -297,7 +298,7 @@ class CollationSQLRegexpSuite val testCases = Seq( RegExpExtractTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD"), RegExpExtractTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", "BĆD"), - RegExpExtractTestCase("ABCDE", ".c.", "UNICODE", "") + RegExpExtractTestCase("ABCDE", ".c.", "UTF8_BINARY", "") ) testCases.foreach(t => { val query = @@ -327,7 +328,7 @@ class CollationSQLRegexpSuite val testCases = Seq( RegExpExtractAllTestCase("ABCDE", ".C.", "UTF8_BINARY", Seq("BCD")), RegExpExtractAllTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", Seq("BĆD")), - RegExpExtractAllTestCase("ABCDE", ".c.", "UNICODE", Seq()) + RegExpExtractAllTestCase("ABCDE", ".c.", "UTF8_BINARY", Seq()) ) testCases.foreach(t => { val query = @@ -357,7 +358,7 @@ class CollationSQLRegexpSuite val testCases = Seq( RegExpCountTestCase("ABCDE", ".C.", "UTF8_BINARY", 1), RegExpCountTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", 1), - RegExpCountTestCase("ABCDE", ".c.", "UNICODE", 0) + RegExpCountTestCase("ABCDE", ".c.", "UTF8_BINARY", 0) ) testCases.foreach(t => { val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), '${t.r}')" @@ -385,7 +386,7 @@ class CollationSQLRegexpSuite val testCases = Seq( RegExpSubStrTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD"), RegExpSubStrTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", "BĆD"), - RegExpSubStrTestCase("ABCDE", ".c.", "UNICODE", null) + RegExpSubStrTestCase("ABCDE", ".c.", "UTF8_BINARY", null) ) testCases.foreach(t => { val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), '${t.r}')" @@ -413,7 +414,7 @@ class CollationSQLRegexpSuite val testCases = Seq( RegExpInStrTestCase("ABCDE", ".C.", "UTF8_BINARY", 2), RegExpInStrTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", 2), - RegExpInStrTestCase("ABCDE", ".c.", "UNICODE", 0) + RegExpInStrTestCase("ABCDE", ".c.", "UTF8_BINARY", 0) ) testCases.foreach(t => { val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), '${t.r}')" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 9cc123b708aff..db02946e3dfe5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -804,10 +804,12 @@ class CollationStringExpressionsSuite test("StringTrim* functions - unit tests for both paths (codegen and eval)") { // Without trimString param. - checkEvaluation(StringTrim(Literal.create( " asd ", StringType("UTF8_BINARY"))), "asd") + checkEvaluation( + StringTrim(Literal.create( " asd ", StringType("UTF8_BINARY"))), "asd") checkEvaluation( StringTrimLeft(Literal.create(" asd ", StringType("UTF8_BINARY_LCASE"))), "asd ") - checkEvaluation(StringTrimRight(Literal.create(" asd ", StringType("UNICODE"))), " asd") + checkEvaluation(StringTrimRight( + Literal.create(" asd ", StringType("UTF8_BINARY"))), " asd") // With trimString param. checkEvaluation( @@ -822,8 +824,8 @@ class CollationStringExpressionsSuite "asd ") checkEvaluation( StringTrimRight( - Literal.create(" asd ", StringType("UNICODE")), - Literal.create(" ", StringType("UNICODE"))), + Literal.create(" asd ", StringType("UTF8_BINARY")), + Literal.create(" ", StringType("UTF8_BINARY"))), " asd") checkEvaluation( @@ -838,8 +840,8 @@ class CollationStringExpressionsSuite "asdxx") checkEvaluation( StringTrimRight( - Literal.create("xxasdxx", StringType("UNICODE")), - Literal.create("x", StringType("UNICODE"))), + Literal.create("xxasdxx", StringType("UTF8_BINARY")), + Literal.create("x", StringType("UTF8_BINARY"))), "xxasd") } @@ -863,10 +865,10 @@ class CollationStringExpressionsSuite StringTrimTestCase("UTF8_BINARY_LCASE", "LTRIM", "xxasdxx", true, "x", "asdxx"), StringTrimTestCase("UTF8_BINARY_LCASE", "RTRIM", " asd ", false, null, " asd"), - StringTrimTestCase("UNICODE", "TRIM", "xxasdxx", true, "x", "asd"), - StringTrimTestCase("UNICODE", "BTRIM", "xxasdxx", true, "x", "asd"), - StringTrimTestCase("UNICODE", "LTRIM", " asd ", false, null, "asd "), - StringTrimTestCase("UNICODE", "RTRIM", " asd ", true, null, null) + StringTrimTestCase("UTF8_BINARY", "TRIM", "xxasdxx", true, "x", "asd"), + StringTrimTestCase("UTF8_BINARY", "BTRIM", "xxasdxx", true, "x", "asd"), + StringTrimTestCase("UTF8_BINARY", "LTRIM", " asd ", false, null, "asd "), + StringTrimTestCase("UTF8_BINARY", "RTRIM", " asd ", true, null, null) // Other more complex cases can be found in unit tests in CollationSupportSuite.java. ) @@ -906,7 +908,7 @@ class CollationStringExpressionsSuite + "COLLATE('x', 'UTF8_BINARY_LCASE'))"), expectedAnswer = Row("a")) checkAnswer( - df = sql("SELECT LTRIM(COLLATE('x', 'UNICODE'), COLLATE('xax', 'UNICODE'))"), + df = sql("SELECT LTRIM(COLLATE('x', 'UTF8_BINARY'), COLLATE('xax', 'UTF8_BINARY'))"), expectedAnswer = Row("ax")) checkAnswer( @@ -916,7 +918,7 @@ class CollationStringExpressionsSuite df = sql("SELECT TRIM('x', COLLATE('xax', 'UTF8_BINARY_LCASE'))"), expectedAnswer = Row("a")) checkAnswer( - df = sql("SELECT BTRIM('xax', COLLATE('x', 'UNICODE'))"), + df = sql("SELECT BTRIM('xax', COLLATE('x', 'UTF8_BINARY'))"), expectedAnswer = Row("a")) checkAnswer( @@ -926,7 +928,7 @@ class CollationStringExpressionsSuite df = sql("SELECT RTRIM(COLLATE('x', 'UTF8_BINARY_LCASE'), 'xax')"), expectedAnswer = Row("xa")) checkAnswer( - df = sql("SELECT TRIM(COLLATE('x', 'UNICODE'), 'xax')"), + df = sql("SELECT TRIM(COLLATE('x', 'UTF8_BINARY'), 'xax')"), expectedAnswer = Row("a")) } @@ -934,13 +936,13 @@ class CollationStringExpressionsSuite List("TRIM", "LTRIM", "RTRIM").foreach(func => { val collationMismatch = intercept[AnalysisException] { sql("SELECT " + func + "(COLLATE('x', 'UTF8_BINARY_LCASE'), " - + "COLLATE('xxaaaxx', 'UNICODE'))") + + "COLLATE('xxaaaxx', 'UTF8_BINARY'))") } assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") }) val collationMismatch = intercept[AnalysisException] { - sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UNICODE'), COLLATE('x', 'UTF8_BINARY_LCASE'))") + sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UTF8_BINARY'), COLLATE('x', 'UTF8_BINARY_LCASE'))") } assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index 7c84e3e2d018b..7110b70104f38 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -491,7 +491,7 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest { val inputData = MemoryStream[(String, Int)] val result = inputData.toDF() .select(col("_1") - .try_cast(StringType("UNICODE")).as("str"), + .try_cast(StringType("UTF8_BINARY")).as("str"), col("_2").as("int")) .dropDuplicates("str")