[SPARK-48403][SQL] Fix Lower & Upper expressions for UTF8_BINARY_LCASE & ICU collations

### What changes were proposed in this pull request?
String lowercase/uppercase conversion in UTF8_BINARY_LCASE now works using ICU default locale, similar to how other ICU collations currently work in Spark.

### Why are the changes needed?
All collations apart from UTF8_BINARY should use the same interface (UCharacter) that utilizes ICU toLowerCase/toUpperCase implementation, rather than mixing JVM & ICU implementations.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing unit tests and e2e sql tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46720 from uros-db/lower-upper-initcap.

Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
apache · Jun 10, 2024 · 61fd936 · 61fd936
1 parent 1901669
commit 61fd936
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 23 deletions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -332,12 +332,58 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
     return buf.build();
   }
 
+  /**
+   * Convert the input string to uppercase using the ICU root locale rules.
+   *
+   * @param target the input string
+   * @return the uppercase string
+   */
+  public static UTF8String toUpperCase(final UTF8String target) {
+    return UTF8String.fromString(toUpperCase(target.toString()));
+  }
+
+  /**
+   * Convert the input string to uppercase using the ICU root locale rules.
+   *
+   * The root locale is passed explicitly: the single-argument ICU overload applies the
+   * JVM default locale, which would make the result depend on the host configuration.
+   *
+   * @param target the input string
+   * @return the uppercase string
+   */
+  public static String toUpperCase(final String target) {
+    return UCharacter.toUpperCase(ULocale.ROOT, target);
+  }
+
+  /**
+   * Convert the input string to uppercase using the specified ICU collation rules.
+   * Delegates to the String overload and wraps the result back into a UTF8String.
+   *
+   * @param target the input string
+   * @param collationId identifier of the collation, forwarded to the String overload
+   * @return the uppercase string
+   */
+  public static UTF8String toUpperCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toUpperCase(target.toString(), collationId));
+  }
+
+  /**
+   * Convert the input string to uppercase using the locale reported by the collator of
+   * the given collation.
+   *
+   * @param target the input string
+   * @param collationId identifier passed to CollationFactory.fetchCollation
+   * @return the uppercase string
+   */
   public static String toUpperCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
     return UCharacter.toUpperCase(locale, target);
   }
 
+  /**
+   * Convert the input string to lowercase using the ICU root locale rules.
+   *
+   * @param target the input string
+   * @return the lowercase string
+   */
+  public static UTF8String toLowerCase(final UTF8String target) {
+    return UTF8String.fromString(toLowerCase(target.toString()));
+  }
+
+  /**
+   * Convert the input string to lowercase using the ICU root locale rules.
+   *
+   * The root locale is passed explicitly: the single-argument ICU overload applies the
+   * JVM default locale, which would make the result depend on the host configuration.
+   *
+   * @param target the input string
+   * @return the lowercase string
+   */
+  public static String toLowerCase(final String target) {
+    return UCharacter.toLowerCase(ULocale.ROOT, target);
+  }
+
+  /**
+   * Convert the input string to lowercase using the specified ICU collation rules.
+   * Delegates to the String overload and wraps the result back into a UTF8String.
+   *
+   * @param target the input string
+   * @param collationId identifier of the collation, forwarded to the String overload
+   * @return the lowercase string
+   */
+  public static UTF8String toLowerCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toLowerCase(target.toString(), collationId));
+  }
   public static String toLowerCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -208,52 +208,66 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
   public static class Upper {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
+      }  else {
         return execICU(v, collationId);
       }
     }
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.Upper.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
+      }  else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toUpperCase();
     }
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toUpperCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(), collationId));
+      return CollationAwareUTF8String.toUpperCase(v, collationId);
     }
   }
 
   public static class Lower {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
       } else {
         return execICU(v, collationId);
       }
     }
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
         String expr = "CollationSupport.Lower.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
+      }  else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toLowerCase();
     }
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toLowerCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(), collationId));
+      return CollationAwareUTF8String.toLowerCase(v, collationId);
     }
   }
 

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -645,10 +645,14 @@ public void testUpper() throws SparkException {
     assertUpper("ab世De", "UNICODE_CI", "AB世DE");
     assertUpper("äbćδe", "UNICODE_CI", "ÄBĆΔE");
     // Case-variable character length
-    assertUpper("i̇o", "UTF8_BINARY","İO");
-    assertUpper("i̇o", "UTF8_BINARY_LCASE","İO");
-    assertUpper("i̇o", "UNICODE","İO");
-    assertUpper("i̇o", "UNICODE_CI","İO");
+    assertUpper("i\u0307o", "UTF8_BINARY","I\u0307O");
+    assertUpper("i\u0307o", "UTF8_BINARY_LCASE","I\u0307O");
+    assertUpper("i\u0307o", "UNICODE","I\u0307O");
+    assertUpper("i\u0307o", "UNICODE_CI","I\u0307O");
+    // One-to-many mappings: a single code point uppercases to several code points.
+    assertUpper("ß ﬁ ﬃ ﬀ ﬆ ῗ", "UTF8_BINARY","SS FI FFI FF ST \u0399\u0308\u0342");
+    assertUpper("ß ﬁ ﬃ ﬀ ﬆ ῗ", "UTF8_BINARY_LCASE","SS FI FFI FF ST \u0399\u0308\u0342");
+    assertUpper("ß ﬁ ﬃ ﬀ ﬆ ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342");
+    assertUpper("ß ﬁ ﬃ ﬀ ﬆ ῗ", "UNICODE_CI","SS FI FFI FF ST \u0399\u0308\u0342");
   }
 
   private void assertLower(String target, String collationName, String expected)
@@ -695,10 +699,10 @@ public void testLower() throws SparkException {
     assertLower("aB世De", "UNICODE_CI", "ab世de");
     assertLower("ÄBĆΔE", "UNICODE_CI", "äbćδe");
     // Case-variable character length
-    assertLower("İo", "UTF8_BINARY","i̇o");
-    assertLower("İo", "UTF8_BINARY_LCASE","i̇o");
-    assertLower("İo", "UNICODE","i̇o");
-    assertLower("İo", "UNICODE_CI","i̇o");
+    assertLower("İo", "UTF8_BINARY","i\u0307o");
+    assertLower("İo", "UTF8_BINARY_LCASE","i\u0307o");
+    assertLower("İo", "UNICODE","i\u0307o");
+    assertLower("İo", "UNICODE_CI","i\u0307o");
   }
 
   private void assertInitCap(String target, String collationName, String expected)