Skip to content

Commit

Permalink
[SPARK-48410][SQL] Fix InitCap expression for UTF8_BINARY_LCASE & ICU…
Browse files Browse the repository at this point in the history
… collations

### What changes were proposed in this pull request?
String titlecase conversion under UTF8_BINARY_LCASE and other ICU collations now works using the appropriate ICU default locale for character mapping, and uses ICU BreakIterator.getWordInstance to locate boundaries between words.

### Why are the changes needed?
Similar Spark expressions such as Lower & Upper use the same interface (UCharacter) to perform collation-aware string transformation, and InitCap should offer a consistent way to titlecase strings across the collation space.

### Does this PR introduce _any_ user-facing change?
Yes, InitCap should now work properly for all collations other than UTF8_BINARY.

### How was this patch tested?
New and existing unit tests, as well as existing e2e sql tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46732 from uros-db/initcap-icu.

Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
uros-db authored and cloud-fan committed Jun 10, 2024
1 parent 61fd936 commit 3857a9d
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,24 @@ public static String lowerCaseCodePoints(final String target) {
return sb.toString();
}

/**
 * Titlecase a UTF8String by round-tripping through java.lang.String: the input is
 * decoded, passed to the single-argument String overload of toTitleCase, and the
 * result is re-encoded as a UTF8String.
 */
public static UTF8String toTitleCase(final UTF8String target) {
  final String decoded = target.toString();
  final String titleCased = toTitleCase(decoded);
  return UTF8String.fromString(titleCased);
}

/**
 * Convert the input string to titlecase using the ICU root locale rules.
 *
 * Both the case mapping and the word-boundary detection are pinned to ULocale.ROOT.
 * The locale-less UCharacter.toTitleCase(String, BreakIterator) overload and the
 * no-argument BreakIterator.getWordInstance() resolve to the default locale instead,
 * which would let the result vary with the JVM configuration.
 *
 * @param target the string to convert
 * @return the titlecased string
 */
public static String toTitleCase(final String target) {
  return UCharacter.toTitleCase(
    ULocale.ROOT, target, BreakIterator.getWordInstance(ULocale.ROOT));
}

/**
 * Titlecase a UTF8String for the collation identified by collationId. The input is
 * decoded to a java.lang.String, handed to the (String, int) overload of toTitleCase,
 * and the result is re-encoded as a UTF8String.
 */
public static UTF8String toTitleCase(final UTF8String target, final int collationId) {
  final String decoded = target.toString();
  final String titleCased = toTitleCase(decoded, collationId);
  return UTF8String.fromString(titleCased);
}

public static String toTitleCase(final String target, final int collationId) {
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,10 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
public static class InitCap {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return execUTF8(v);
if (collation.supportsBinaryEquality) {
return execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
Expand All @@ -284,25 +286,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) {
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.InitCap.exec";
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return String.format(expr + "UTF8(%s)", v);
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}

public static UTF8String execUTF8(final UTF8String v) {
public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase().toTitleCase();
}

/**
 * InitCap implementation selected when the collation supports lowercase equality;
 * hands the input to the CollationAwareUTF8String.toTitleCase overload that takes
 * no collation id.
 */
public static UTF8String execLowercase(final UTF8String v) {
  final UTF8String titleCased = CollationAwareUTF8String.toTitleCase(v);
  return titleCased;
}
public static UTF8String execICU(final UTF8String v, final int collationId) {
return UTF8String.fromString(
CollationAwareUTF8String.toTitleCase(
CollationAwareUTF8String.toLowerCase(
v.toString(),
collationId
),
collationId));
return CollationAwareUTF8String.toTitleCase(v, collationId);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -749,10 +749,48 @@ public void testInitCap() throws SparkException {
assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De");
assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe");
// Case-variable character length
assertInitCap("İo", "UTF8_BINARY", "İo");
assertInitCap("İo", "UTF8_BINARY_LCASE", "İo");
assertInitCap("İo", "UNICODE", "İo");
assertInitCap("İo", "UNICODE_CI", "İo");
assertInitCap("İo", "UTF8_BINARY", "I\u0307o");
assertInitCap("İo", "UTF8_BINARY_LCASE", "İo");
assertInitCap("İo", "UNICODE", "İo");
assertInitCap("İo", "UNICODE_CI", "İo");
assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o");
assertInitCap("i\u0307o", "UTF8_BINARY_LCASE", "I\u0307o");
assertInitCap("i\u0307o", "UNICODE", "I\u0307o");
assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o");
// Different possible word boundaries
assertInitCap("a b c", "UTF8_BINARY", "A B C");
assertInitCap("a b c", "UNICODE", "A B C");
assertInitCap("a b c", "UTF8_BINARY_LCASE", "A B C");
assertInitCap("a b c", "UNICODE_CI", "A B C");
assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c");
assertInitCap("a.b,c", "UNICODE", "A.b,C");
assertInitCap("a.b,c", "UTF8_BINARY_LCASE", "A.b,C");
assertInitCap("a.b,c", "UNICODE_CI", "A.b,C");
assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c");
assertInitCap("a. b-c", "UNICODE", "A. B-C");
assertInitCap("a. b-c", "UTF8_BINARY_LCASE", "A. B-C");
assertInitCap("a. b-c", "UNICODE_CI", "A. B-C");
assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c");
assertInitCap("a?b世c", "UNICODE", "A?B世C");
assertInitCap("a?b世c", "UTF8_BINARY_LCASE", "A?B世C");
assertInitCap("a?b世c", "UNICODE_CI", "A?B世C");
// Titlecase characters that are different from uppercase characters
assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz");
assertInitCap("dzDZDz", "UNICODE", "Dzdzdz");
assertInitCap("dzDZDz", "UTF8_BINARY_LCASE", "Dzdzdz");
assertInitCap("dzDZDz", "UNICODE_CI", "Dzdzdz");
assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY", "Džaba Ljubav Njegova");
assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova");
assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY_LCASE", "Džaba Ljubav Njegova");
assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY",
"ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY_LCASE",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
}

private void assertStringInstr(String string, String substring, String collationName,
Expand Down

0 comments on commit 3857a9d

Please sign in to comment.