Skip to content

Commit

Permalink
Merge branch 'apache:master' into skipSnapshotAtBatch
Browse files Browse the repository at this point in the history
  • Loading branch information
eason-yuchen-liu authored Jun 10, 2024
2 parents 9d902d7 + 5a2f374 commit eddb3c7
Show file tree
Hide file tree
Showing 124 changed files with 3,030 additions and 729 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build_sparkr_window.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
#
name: "Build / SparkR-only (master, 4.3.3, windows-2019)"
name: "Build / SparkR-only (master, 4.4.0, windows-2019)"

on:
schedule:
Expand Down Expand Up @@ -50,10 +50,10 @@ jobs:
with:
distribution: zulu
java-version: 17
- name: Install R 4.3.3
- name: Install R 4.4.0
uses: r-lib/actions/setup-r@v2
with:
r-version: 4.3.3
r-version: 4.4.0
- name: Install R dependencies
run: |
Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"
Expand Down
1 change: 0 additions & 1 deletion LICENSE-binary
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,6 @@ com.google.crypto.tink:tink
com.google.flatbuffers:flatbuffers-java
com.google.guava:guava
com.jamesmurty.utils:java-xmlbuilder
com.jolbox:bonecp
com.ning:compress-lzf
com.squareup.okhttp3:logging-interceptor
com.squareup.okhttp3:okhttp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,54 @@ private static int lowercaseRFind(
return MATCH_NOT_FOUND;
}

/**
* Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default
* UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this
* method uses code points to compare the strings in a case-insensitive manner using ICU rules,
* as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints).
*
* @param left The first UTF8String to compare.
* @param right The second UTF8String to compare.
* @return An integer representing the comparison result.
*/
public static int compareLowerCase(final UTF8String left, final UTF8String right) {
  // The byte-wise fast path (no string allocations) is only valid when *both* inputs are
  // pure ASCII; any other combination goes through the code point based comparison.
  final boolean bothAscii = left.isFullAscii() && right.isFullAscii();
  return bothAscii ? compareLowerCaseAscii(left, right) : compareLowerCaseSlow(left, right);
}

/**
* Fast version of the `compareLowerCase` method, used when both arguments are ASCII strings.
*
* @param left The first ASCII UTF8String to compare.
* @param right The second ASCII UTF8String to compare.
* @return An integer representing the comparison result.
*/
private static int compareLowerCaseAscii(final UTF8String left, final UTF8String right) {
  final int leftLen = left.numBytes();
  final int rightLen = right.numBytes();
  // Only the shared prefix can be compared byte by byte.
  final int sharedLen = Math.min(leftLen, rightLen);
  int pos = 0;
  while (pos < sharedLen) {
    final int lhs = Character.toLowerCase(left.getByte(pos));
    final int rhs = Character.toLowerCase(right.getByte(pos));
    if (lhs != rhs) {
      // First differing (lowercased) byte decides the ordering.
      return lhs - rhs;
    }
    pos++;
  }
  // Equal on the shared prefix: the shorter string sorts first.
  return leftLen - rightLen;
}

/**
* Slow version of the `compareLowerCase` method, used when at least one of the arguments is
* not a fully ASCII string.
*
* @param left The first UTF8String to compare (may contain non-ASCII characters).
* @param right The second UTF8String to compare (may contain non-ASCII characters).
* @return An integer representing the comparison result.
*/
private static int compareLowerCaseSlow(final UTF8String left, final UTF8String right) {
  // Lowercase both sides code point by code point, then compare the resulting Java strings.
  final String loweredLeft = lowerCaseCodePoints(left.toString());
  final String loweredRight = lowerCaseCodePoints(right.toString());
  return loweredLeft.compareTo(loweredRight);
}

public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -284,18 +332,124 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
return buf.build();
}

/**
* Convert the input string to uppercase using the ICU root locale rules.
*
* @param target the input string
* @return the uppercase string
*/
public static UTF8String toUpperCase(final UTF8String target) {
  // Round-trip through java.lang.String so the String overload does the case mapping.
  final String upperCased = toUpperCase(target.toString());
  return UTF8String.fromString(upperCased);
}

/**
 * Convert the input string to uppercase using the ICU root locale rules.
 *
 * The root locale is passed explicitly: the single-argument ICU overload
 * `UCharacter.toUpperCase(String)` uses the JVM default locale, which would make the result
 * depend on the environment (e.g. under a Turkish default locale 'i' maps to U+0130).
 *
 * @param target the input string
 * @return the uppercase string
 */
public static String toUpperCase(final String target) {
  return UCharacter.toUpperCase(ULocale.ROOT, target);
}

/**
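* Convert the input string to uppercase using the case mapping rules of the locale associated
* with the specified ICU collation.
*
* @param target the input string
* @param collationId the ID of the collation whose collator locale is used for case mapping
* @return the uppercase string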
*/
public static UTF8String toUpperCase(final UTF8String target, final int collationId) {
  // Delegate the actual case mapping to the String overload, then re-wrap the result.
  final String upperCased = toUpperCase(target.toString(), collationId);
  return UTF8String.fromString(upperCased);
}

public static String toUpperCase(final String target, final int collationId) {
  // Case-map with the locale reported by the collation's collator.
  final ULocale collatorLocale =
    CollationFactory.fetchCollation(collationId).collator.getLocale(ULocale.ACTUAL_LOCALE);
  return UCharacter.toUpperCase(collatorLocale, target);
}

/**
* Convert the input string to lowercase using the ICU root locale rules.
*
* @param target the input string
* @return the lowercase string
*/
public static UTF8String toLowerCase(final UTF8String target) {
  // Round-trip through java.lang.String so the String overload does the case mapping.
  final String lowerCased = toLowerCase(target.toString());
  return UTF8String.fromString(lowerCased);
}
/**
 * Convert the input string to lowercase using the ICU root locale rules.
 *
 * The root locale is passed explicitly: the single-argument ICU overload
 * `UCharacter.toLowerCase(String)` uses the JVM default locale, which would make the result
 * depend on the environment (e.g. under a Turkish default locale 'I' maps to U+0131).
 *
 * @param target the input string
 * @return the lowercase string
 */
public static String toLowerCase(final String target) {
  return UCharacter.toLowerCase(ULocale.ROOT, target);
}

/**
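* Convert the input string to lowercase using the case mapping rules of the locale associated
* with the specified ICU collation.
*
* @param target the input string
* @param collationId the ID of the collation whose collator locale is used for case mapping
* @return the lowercase string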
*/
public static UTF8String toLowerCase(final UTF8String target, final int collationId) {
  // Delegate the actual case mapping to the String overload, then re-wrap the result.
  final String lowerCased = toLowerCase(target.toString(), collationId);
  return UTF8String.fromString(lowerCased);
}
public static String toLowerCase(final String target, final int collationId) {
  // Case-map with the locale reported by the collation's collator.
  final ULocale collatorLocale =
    CollationFactory.fetchCollation(collationId).collator.getLocale(ULocale.ACTUAL_LOCALE);
  return UCharacter.toLowerCase(collatorLocale, target);
}

/**
* Converts a single code point to lowercase using ICU rules, with special handling for
* one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
* context-sensitive case mappings (i.e. characters that map to different characters based on
* string context - e.g. the position in the string relative to other characters).
*
* @param codePoint The code point to convert to lowercase.
* @param sb The StringBuilder to append the lowercase character to.
*/
private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
  switch (codePoint) {
    case 0x0130:
      // U+0130 (Latin capital letter I with dot above) lowercases to two code points:
      // U+0069 (Latin small letter i) followed by U+0307 (combining dot above).
      sb.appendCodePoint(0x0069);
      sb.appendCodePoint(0x0307);
      break;
    case 0x03C2:
      // U+03C2 (Greek final sigma) is folded to U+03C3 so that the final and non-final
      // forms of sigma are mapped to the same code point.
      sb.appendCodePoint(0x03C3);
      break;
    default:
      // Everything else uses ICU's context-unaware single code point lowercase mapping.
      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
      break;
  }
}

/**
* Converts an entire string to lowercase using ICU rules, code point by code point, with
* special handling for one-to-many case mappings (i.e. characters that map to multiple
* characters in lowercase). Also, this method omits information about context-sensitive case
* mappings using special handling in the `lowercaseCodePoint` method.
*
* @param target The target string to convert to lowercase.
* @return The string converted to lowercase in a context-unaware manner.
*/
public static String lowerCaseCodePoints(final String target) {
  final int length = target.length();
  final StringBuilder sb = new StringBuilder(length);
  int index = 0;
  while (index < length) {
    final int codePoint = target.codePointAt(index);
    lowercaseCodePoint(codePoint, sb);
    // Advance by the number of UTF-16 units the code point occupies. Stepping by one would
    // revisit the low surrogate of a supplementary character and append it a second time.
    index += Character.charCount(codePoint);
  }
  return sb.toString();
}

/**
* Convert the input string to titlecase using the ICU root locale rules.
*/
public static UTF8String toTitleCase(final UTF8String target) {
  // Round-trip through java.lang.String so the String overload does the case mapping.
  final String titleCased = toTitleCase(target.toString());
  return UTF8String.fromString(titleCased);
}

/**
 * Convert the input string to titlecase using the ICU root locale rules.
 *
 * Both the case mapping and the word break iterator are pinned to the root locale: the
 * overloads without a locale argument use the JVM default locale, which would make the result
 * depend on the environment.
 *
 * @param target the input string
 * @return the titlecase string
 */
public static String toTitleCase(final String target) {
  return UCharacter.toTitleCase(
    ULocale.ROOT, target, BreakIterator.getWordInstance(ULocale.ROOT));
}

/**
* Convert the input string to titlecase using the specified ICU collation rules.
*/
public static UTF8String toTitleCase(final UTF8String target, final int collationId) {
  // Delegate the actual case mapping to the String overload, then re-wrap the result.
  final String titleCased = toTitleCase(target.toString(), collationId);
  return UTF8String.fromString(titleCased);
}

public static String toTitleCase(final String target, final int collationId) {
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand Down Expand Up @@ -351,9 +505,8 @@ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pat

public static int indexOf(final UTF8String target, final UTF8String pattern,
final int start, final int collationId) {
if (pattern.numBytes() == 0) {
return target.indexOfEmpty(start);
}
if (pattern.numBytes() == 0) return target.indexOfEmpty(start);
if (target.numBytes() == 0) return MATCH_NOT_FOUND;

StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
stringSearch.setIndex(start);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -412,9 +412,9 @@ protected Collation buildCollation() {
"UTF8_BINARY_LCASE",
PROVIDER_SPARK,
null,
UTF8String::compareLowerCase,
CollationAwareUTF8String::compareLowerCase,
"1.0",
s -> (long) s.toLowerCase().hashCode(),
s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
/* supportsBinaryEquality = */ false,
/* supportsBinaryOrdering = */ false,
/* supportsLowercaseEquality = */ true);
Expand Down Expand Up @@ -671,7 +671,7 @@ protected Collation buildCollation() {
(s1, s2) -> collator.compare(s1.toString(), s2.toString()),
ICU_COLLATOR_VERSION,
s -> (long) collator.getCollationKey(s.toString()).hashCode(),
/* supportsBinaryEquality = */ collationId == UNICODE_COLLATION_ID,
/* supportsBinaryEquality = */ false,
/* supportsBinaryOrdering = */ false,
/* supportsLowercaseEquality = */ false);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,60 +208,76 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
public static class Upper {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return execUTF8(v);
} else {
if (collation.supportsBinaryEquality) {
return execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return String.format(expr + "UTF8(%s)", v);
} else {
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
public static UTF8String execBinary(final UTF8String v) {
return v.toUpperCase();
}
public static UTF8String execLowercase(final UTF8String v) {
  // Uppercase via the collation-aware helper rather than UTF8String's own toUpperCase.
  final UTF8String upperCased = CollationAwareUTF8String.toUpperCase(v);
  return upperCased;
}
public static UTF8String execICU(final UTF8String v, final int collationId) {
return UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(), collationId));
return CollationAwareUTF8String.toUpperCase(v, collationId);
}
}

public static class Lower {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return execUTF8(v);
if (collation.supportsBinaryEquality) {
return execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Lower.exec";
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return String.format(expr + "UTF8(%s)", v);
} else {
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}
public static UTF8String execUTF8(final UTF8String v) {
public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase();
}
public static UTF8String execLowercase(final UTF8String v) {
  // Lowercase via the collation-aware helper rather than UTF8String's own toLowerCase.
  final UTF8String lowerCased = CollationAwareUTF8String.toLowerCase(v);
  return lowerCased;
}
public static UTF8String execICU(final UTF8String v, final int collationId) {
return UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(), collationId));
return CollationAwareUTF8String.toLowerCase(v, collationId);
}
}

public static class InitCap {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return execUTF8(v);
if (collation.supportsBinaryEquality) {
return execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
Expand All @@ -270,25 +286,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) {
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.InitCap.exec";
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return String.format(expr + "UTF8(%s)", v);
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}

public static UTF8String execUTF8(final UTF8String v) {
public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase().toTitleCase();
}

public static UTF8String execLowercase(final UTF8String v) {
  // Titlecase via the collation-aware helper rather than UTF8String's own toTitleCase.
  final UTF8String titleCased = CollationAwareUTF8String.toTitleCase(v);
  return titleCased;
}
public static UTF8String execICU(final UTF8String v, final int collationId) {
return UTF8String.fromString(
CollationAwareUTF8String.toTitleCase(
CollationAwareUTF8String.toLowerCase(
v.toString(),
collationId
),
collationId));
return CollationAwareUTF8String.toTitleCase(v, collationId);
}
}

Expand Down
Loading

0 comments on commit eddb3c7

Please sign in to comment.