Skip to content

Commit

Permalink
[SPARK-48403][SQL] Fix Lower & Upper expressions for UTF8_BINARY_LCASE & ICU collations
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?
String lowercase/uppercase conversion in UTF8_BINARY_LCASE now works using ICU default locale, similar to how other ICU collations currently work in Spark.

### Why are the changes needed?
All collations apart from UTF8_BINARY should use the same interface (UCharacter) that utilizes ICU toLowerCase/toUpperCase implementation, rather than mixing JVM & ICU implementations.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing unit tests and end-to-end SQL tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46720 from uros-db/lower-upper-initcap.

Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
uros-db authored and cloud-fan committed Jun 10, 2024
1 parent 1901669 commit 61fd936
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -332,12 +332,58 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
return buf.build();
}

/**
 * Uppercases a {@code UTF8String} without any collation-specific locale.
 *
 * The value is decoded to a Java {@code String}, cased by {@link #toUpperCase(String)},
 * and then re-encoded as a {@code UTF8String}.
 *
 * @param target the input string
 * @return the uppercase string
 */
public static UTF8String toUpperCase(final UTF8String target) {
  final String decoded = target.toString();
  final String uppercased = toUpperCase(decoded);
  return UTF8String.fromString(uppercased);
}

/**
 * Convert the input string to uppercase using the ICU root locale rules.
 *
 * The root locale is passed explicitly: the single-argument
 * {@code UCharacter.toUpperCase(String)} cases with the default locale, which would make
 * the result depend on the locale of the JVM running the code (for example, a Turkish
 * default locale maps "i" to dotted capital "İ").
 *
 * @param target the input string
 * @return the uppercase string
 */
public static String toUpperCase(final String target) {
  return UCharacter.toUpperCase(ULocale.ROOT, target);
}

/**
 * Uppercases a {@code UTF8String} for the collation identified by {@code collationId}.
 *
 * The value is decoded to a Java {@code String}, cased by
 * {@link #toUpperCase(String, int)}, and then re-encoded as a {@code UTF8String}.
 *
 * @param target the input string
 * @param collationId the ID of the collation to look up for casing
 * @return the uppercase string
 */
public static UTF8String toUpperCase(final UTF8String target, final int collationId) {
  final String decoded = target.toString();
  final String uppercased = toUpperCase(decoded, collationId);
  return UTF8String.fromString(uppercased);
}

/**
 * Convert the input string to uppercase using the locale of the given collation.
 *
 * The collation is fetched from {@code CollationFactory}, and the locale its ICU collator
 * reports as {@code ULocale.ACTUAL_LOCALE} is handed to ICU's {@code UCharacter} casing.
 *
 * @param target the input string
 * @param collationId the ID of the collation whose collator supplies the locale
 * @return the uppercase string
 */
public static String toUpperCase(final String target, final int collationId) {
  final CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
  final ULocale actualLocale = collation.collator.getLocale(ULocale.ACTUAL_LOCALE);
  return UCharacter.toUpperCase(actualLocale, target);
}

/**
 * Lowercases a {@code UTF8String} without any collation-specific locale.
 *
 * The value is decoded to a Java {@code String}, cased by {@link #toLowerCase(String)},
 * and then re-encoded as a {@code UTF8String}.
 *
 * @param target the input string
 * @return the lowercase string
 */
public static UTF8String toLowerCase(final UTF8String target) {
  final String decoded = target.toString();
  final String lowercased = toLowerCase(decoded);
  return UTF8String.fromString(lowercased);
}
/**
 * Convert the input string to lowercase using the ICU root locale rules.
 *
 * The root locale is passed explicitly: the single-argument
 * {@code UCharacter.toLowerCase(String)} cases with the default locale, which would make
 * the result depend on the locale of the JVM running the code (for example, a Turkish
 * default locale maps "I" to dotless "ı").
 *
 * @param target the input string
 * @return the lowercase string
 */
public static String toLowerCase(final String target) {
  return UCharacter.toLowerCase(ULocale.ROOT, target);
}

/**
 * Lowercases a {@code UTF8String} for the collation identified by {@code collationId}.
 *
 * The value is decoded to a Java {@code String}, cased by
 * {@link #toLowerCase(String, int)}, and then re-encoded as a {@code UTF8String}.
 *
 * @param target the input string
 * @param collationId the ID of the collation to look up for casing
 * @return the lowercase string
 */
public static UTF8String toLowerCase(final UTF8String target, final int collationId) {
  final String decoded = target.toString();
  final String lowercased = toLowerCase(decoded, collationId);
  return UTF8String.fromString(lowercased);
}
public static String toLowerCase(final String target, final int collationId) {
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,52 +208,66 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
public static class Upper {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return execUTF8(v);
} else {
if (collation.supportsBinaryEquality) {
return execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return String.format(expr + "UTF8(%s)", v);
} else {
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
public static UTF8String execBinary(final UTF8String v) {
return v.toUpperCase();
}
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toUpperCase(v);
}
public static UTF8String execICU(final UTF8String v, final int collationId) {
return UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(), collationId));
return CollationAwareUTF8String.toUpperCase(v, collationId);
}
}

public static class Lower {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return execUTF8(v);
if (collation.supportsBinaryEquality) {
return execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Lower.exec";
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return String.format(expr + "UTF8(%s)", v);
} else {
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase();
}
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toLowerCase(v);
}
public static UTF8String execICU(final UTF8String v, final int collationId) {
return UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(), collationId));
return CollationAwareUTF8String.toLowerCase(v, collationId);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -645,10 +645,14 @@ public void testUpper() throws SparkException {
assertUpper("ab世De", "UNICODE_CI", "AB世DE");
assertUpper("äbćδe", "UNICODE_CI", "ÄBĆΔE");
// Case-variable character length
assertUpper("i̇o", "UTF8_BINARY","İO");
assertUpper("i̇o", "UTF8_BINARY_LCASE","İO");
assertUpper("i̇o", "UNICODE","İO");
assertUpper("i̇o", "UNICODE_CI","İO");
assertUpper("i\u0307o", "UTF8_BINARY","I\u0307O");
assertUpper("i\u0307o", "UTF8_BINARY_LCASE","I\u0307O");
assertUpper("i\u0307o", "UNICODE","I\u0307O");
assertUpper("i\u0307o", "UNICODE_CI","I\u0307O");
assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY","SS FI FFI FF ST \u0399\u0308\u0342");
assertUpper("ß fi ffi ff st ῗ", "UTF8_BINARY_LCASE","SS FI FFI FF ST \u0399\u0308\u0342");
assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342");
assertUpper("ß fi ffi ff st ῗ", "UNICODE","SS FI FFI FF ST \u0399\u0308\u0342");
}

private void assertLower(String target, String collationName, String expected)
Expand Down Expand Up @@ -695,10 +699,10 @@ public void testLower() throws SparkException {
assertLower("aB世De", "UNICODE_CI", "ab世de");
assertLower("ÄBĆΔE", "UNICODE_CI", "äbćδe");
// Case-variable character length
assertLower("İo", "UTF8_BINARY","i̇o");
assertLower("İo", "UTF8_BINARY_LCASE","i̇o");
assertLower("İo", "UNICODE","i̇o");
assertLower("İo", "UNICODE_CI","i̇o");
assertLower("İo", "UTF8_BINARY","i\u0307o");
assertLower("İo", "UTF8_BINARY_LCASE","i\u0307o");
assertLower("İo", "UNICODE","i\u0307o");
assertLower("İo", "UNICODE_CI","i\u0307o");
}

private void assertInitCap(String target, String collationName, String expected)
Expand Down

0 comments on commit 61fd936

Please sign in to comment.