[SPARK-48410][SQL] Fix InitCap expression for UTF8_BINARY_LCASE & ICU…

… collations ### What changes were proposed in this pull request? String titlecase conversion under UTF8_BINARY_LCASE and other ICU collations now works using the appropriate ICU default locale for character mapping, and uses ICU BreakIterator.getWordInstance to locate boundaries between words. ### Why are the changes needed? Similar Spark expressions such as Lower & Upper use the same interface (UCharacter) to perform collation-aware string transformation, and InitCap should offer a consistent way to titlecase strings across the collation space. ### Does this PR introduce _any_ user-facing change? Yes, InitCap should now work properly for all collations other than UTF8_BINARY. ### How was this patch tested? New and existing unit tests, as well as existing e2e sql tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46732 from uros-db/initcap-icu. Authored-by: Uros Bojanic <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
apache · Jun 10, 2024 · 3857a9d · 3857a9d
1 parent 61fd936
commit 3857a9d
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 18 deletions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -432,6 +432,24 @@ public static String lowerCaseCodePoints(final String target) {
     return sb.toString();
   }
 
+  /**
+   * Convert the input string to titlecase using the ICU root locale rules.
+   */
+  public static UTF8String toTitleCase(final UTF8String target) {
+    return UTF8String.fromString(toTitleCase(target.toString()));
+  }
+
+  public static String toTitleCase(final String target) {
+    return UCharacter.toTitleCase(target, BreakIterator.getWordInstance());
+  }
+
+  /**
+   * Convert the input string to titlecase using the specified ICU collation rules.
+   */
+  public static UTF8String toTitleCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toTitleCase(target.toString(), collationId));
+  }
+
   public static String toTitleCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -274,8 +274,10 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
   public static class InitCap {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
       } else {
         return execICU(v, collationId);
       }
@@ -284,25 +286,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) {
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.InitCap.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
       } else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toLowerCase().toTitleCase();
     }
-
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toTitleCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(
-              CollationAwareUTF8String.toTitleCase(
-                      CollationAwareUTF8String.toLowerCase(
-                              v.toString(),
-                              collationId
-                      ),
-                      collationId));
+      return CollationAwareUTF8String.toTitleCase(v, collationId);
     }
   }
 

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -749,10 +749,48 @@ public void testInitCap() throws SparkException {
     assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De");
     assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe");
     // Case-variable character length
-    assertInitCap("İo", "UTF8_BINARY", "İo");
-    assertInitCap("İo", "UTF8_BINARY_LCASE", "İo");
-    assertInitCap("İo", "UNICODE", "İo");
-    assertInitCap("İo", "UNICODE_CI", "İo");
+    assertInitCap("İo", "UTF8_BINARY", "I\u0307o");
+    assertInitCap("İo", "UTF8_BINARY_LCASE", "İo");
+    assertInitCap("İo", "UNICODE", "İo");
+    assertInitCap("İo", "UNICODE_CI", "İo");
+    assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o");
+    assertInitCap("i\u0307o", "UTF8_BINARY_LCASE", "I\u0307o");
+    assertInitCap("i\u0307o", "UNICODE", "I\u0307o");
+    assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o");
+    // Different possible word boundaries
+    assertInitCap("a b c", "UTF8_BINARY", "A B C");
+    assertInitCap("a b c", "UNICODE", "A B C");
+    assertInitCap("a b c", "UTF8_BINARY_LCASE", "A B C");
+    assertInitCap("a b c", "UNICODE_CI", "A B C");
+    assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c");
+    assertInitCap("a.b,c", "UNICODE", "A.b,C");
+    assertInitCap("a.b,c", "UTF8_BINARY_LCASE", "A.b,C");
+    assertInitCap("a.b,c", "UNICODE_CI", "A.b,C");
+    assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c");
+    assertInitCap("a. b-c", "UNICODE", "A. B-C");
+    assertInitCap("a. b-c", "UTF8_BINARY_LCASE", "A. B-C");
+    assertInitCap("a. b-c", "UNICODE_CI", "A. B-C");
+    assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c");
+    assertInitCap("a?b世c", "UNICODE", "A?B世C");
+    assertInitCap("a?b世c", "UTF8_BINARY_LCASE", "A?B世C");
+    assertInitCap("a?b世c", "UNICODE_CI", "A?B世C");
+    // Titlecase characters that are different from uppercase characters
+    assertInitCap("ǳǱǲ", "UTF8_BINARY", "ǲǳǳ");
+    assertInitCap("ǳǱǲ", "UNICODE", "ǲǳǳ");
+    assertInitCap("ǳǱǲ", "UTF8_BINARY_LCASE", "ǲǳǳ");
+    assertInitCap("ǳǱǲ", "UNICODE_CI", "ǲǳǳ");
+    assertInitCap("ǆaba ǈubav Ǌegova", "UTF8_BINARY", "ǅaba ǈubav ǋegova");
+    assertInitCap("ǆaba ǈubav Ǌegova", "UNICODE", "ǅaba ǈubav ǋegova");
+    assertInitCap("ǆaba ǈubav Ǌegova", "UTF8_BINARY_LCASE", "ǅaba ǈubav ǋegova");
+    assertInitCap("ǆaba ǈubav Ǌegova", "UNICODE_CI", "ǅaba ǈubav ǋegova");
+    assertInitCap("ß ﬁ ﬃ ﬀ ﬆ ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY",
+      "ß ﬁ ﬃ ﬀ ﬆ Σημερινος Ασημενιος I\u0307ota");
+    assertInitCap("ß ﬁ ﬃ ﬀ ﬆ ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY_LCASE",
+      "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
+    assertInitCap("ß ﬁ ﬃ ﬀ ﬆ ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE",
+      "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
+    assertInitCap("ß ﬁ ﬃ ﬀ ﬆ ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI",
+      "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
   }
 
   private void assertStringInstr(String string, String substring, String collationName,