Merge branch 'apache:master' into skipSnapshotAtBatch

eason-yuchen-liu · Jun 10, 2024 · eddb3c7 · eddb3c7
2 parents 9d902d7 + 5a2f374
commit eddb3c7
Show file tree

Hide file tree

Showing 124 changed files with 3,030 additions and 729 deletions.
diff --git a/.github/workflows/build_sparkr_window.yml b/.github/workflows/build_sparkr_window.yml
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-name: "Build / SparkR-only (master, 4.3.3, windows-2019)"
+name: "Build / SparkR-only (master, 4.4.0, windows-2019)"
 
 on:
   schedule:
@@ -50,10 +50,10 @@ jobs:
       with:
         distribution: zulu
         java-version: 17
-    - name: Install R 4.3.3
+    - name: Install R 4.4.0
       uses: r-lib/actions/setup-r@v2
       with:
-        r-version: 4.3.3
+        r-version: 4.4.0
     - name: Install R dependencies
       run: |
         Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"

diff --git a/LICENSE-binary b/LICENSE-binary
@@ -218,7 +218,6 @@ com.google.crypto.tink:tink
 com.google.flatbuffers:flatbuffers-java
 com.google.guava:guava
 com.jamesmurty.utils:java-xmlbuilder
-com.jolbox:bonecp
 com.ning:compress-lzf
 com.squareup.okhttp3:logging-interceptor
 com.squareup.okhttp3:okhttp

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -183,6 +183,54 @@ private static int lowercaseRFind(
     return MATCH_NOT_FOUND;
   }
 
+  /**
+   * Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default
+   * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this
+   * method uses code points to compare the strings in a case-insensitive manner using ICU rules,
+   * as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints).
+   *
+   * @param left The first UTF8String to compare.
+   * @param right The second UTF8String to compare.
+   * @return An integer representing the comparison result.
+   */
+  public static int compareLowerCase(final UTF8String left, final UTF8String right) {
+    // Only if both strings are ASCII, we can use faster comparison (no string allocations).
+    if (left.isFullAscii() && right.isFullAscii()) {
+      return compareLowerCaseAscii(left, right);
+    }
+    return compareLowerCaseSlow(left, right);
+  }
+
+  /**
+   * Fast version of the `compareLowerCase` method, used when both arguments are ASCII strings.
+   *
+   * @param left The first ASCII UTF8String to compare.
+   * @param right The second ASCII UTF8String to compare.
+   * @return An integer representing the comparison result.
+   */
+  private static int compareLowerCaseAscii(final UTF8String left, final UTF8String right) {
+    int leftBytes = left.numBytes(), rightBytes = right.numBytes();
+    for (int curr = 0; curr < leftBytes && curr < rightBytes; curr++) {
+      int lowerLeftByte = Character.toLowerCase(left.getByte(curr));
+      int lowerRightByte = Character.toLowerCase(right.getByte(curr));
+      if (lowerLeftByte != lowerRightByte) {
+        return lowerLeftByte - lowerRightByte;
+      }
+    }
+    return leftBytes - rightBytes;
+  }
+
+  /**
+   * Slow version of the `compareLowerCase` method, used when both arguments are non-ASCII strings.
+   *
+   * @param left The first non-ASCII UTF8String to compare.
+   * @param right The second non-ASCII UTF8String to compare.
+   * @return An integer representing the comparison result.
+   */
+  private static int compareLowerCaseSlow(final UTF8String left, final UTF8String right) {
+    return lowerCaseCodePoints(left.toString()).compareTo(lowerCaseCodePoints(right.toString()));
+  }
+
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -284,18 +332,124 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
     return buf.build();
   }
 
+  /**
+   * Convert the input string to uppercase using the ICU root locale rules.
+   *
+   * @param target the input string
+   * @return the uppercase string
+   */
+  public static UTF8String toUpperCase(final UTF8String target) {
+    return UTF8String.fromString(toUpperCase(target.toString()));
+  }
+
+  public static String toUpperCase(final String target) {
+    return UCharacter.toUpperCase(target);
+  }
+
+  /**
+   * Convert the input string to uppercase using the specified ICU collation rules.
+   *
+   * @param target the input string
+   * @return the uppercase string
+   */
+  public static UTF8String toUpperCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toUpperCase(target.toString(), collationId));
+  }
+
   public static String toUpperCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
     return UCharacter.toUpperCase(locale, target);
   }
 
+  /**
+   * Convert the input string to lowercase using the ICU root locale rules.
+   *
+   * @param target the input string
+   * @return the lowercase string
+   */
+  public static UTF8String toLowerCase(final UTF8String target) {
+    return UTF8String.fromString(toLowerCase(target.toString()));
+  }
+  public static String toLowerCase(final String target) {
+    return UCharacter.toLowerCase(target);
+  }
+
+  /**
+   * Convert the input string to lowercase using the specified ICU collation rules.
+   *
+   * @param target the input string
+   * @return the lowercase string
+   */
+  public static UTF8String toLowerCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toLowerCase(target.toString(), collationId));
+  }
   public static String toLowerCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
     return UCharacter.toLowerCase(locale, target);
   }
 
+  /**
+   * Converts a single code point to lowercase using ICU rules, with special handling for
+   * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
+   * context-insensitive case mappings (i.e. characters that map to different characters based on
+   * string context - e.g. the position in the string relative to other characters).
+   *
+   * @param codePoint The code point to convert to lowercase.
+   * @param sb The StringBuilder to append the lowercase character to.
+   */
+  private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
+    if (codePoint == 0x0130) {
+      // Latin capital letter I with dot above is mapped to 2 lowercase characters.
+      sb.appendCodePoint(0x0069);
+      sb.appendCodePoint(0x0307);
+    }
+    else if (codePoint == 0x03C2) {
+      // Greek final and non-final capital letter sigma should be mapped the same.
+      sb.appendCodePoint(0x03C3);
+    }
+    else {
+      // All other characters should follow context-unaware ICU single-code point case mapping.
+      sb.appendCodePoint(UCharacter.toLowerCase(codePoint));
+    }
+  }
+
+  /**
+   * Converts an entire string to lowercase using ICU rules, code point by code point, with
+   * special handling for one-to-many case mappings (i.e. characters that map to multiple
+   * characters in lowercase). Also, this method omits information about context-sensitive case
+   * mappings using special handling in the `lowercaseCodePoint` method.
+   *
+   * @param target The target string to convert to lowercase.
+   * @return The string converted to lowercase in a context-unaware manner.
+   */
+  public static String lowerCaseCodePoints(final String target) {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < target.length(); ++i) {
+      lowercaseCodePoint(target.codePointAt(i), sb);
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Convert the input string to titlecase using the ICU root locale rules.
+   */
+  public static UTF8String toTitleCase(final UTF8String target) {
+    return UTF8String.fromString(toTitleCase(target.toString()));
+  }
+
+  public static String toTitleCase(final String target) {
+    return UCharacter.toTitleCase(target, BreakIterator.getWordInstance());
+  }
+
+  /**
+   * Convert the input string to titlecase using the specified ICU collation rules.
+   */
+  public static UTF8String toTitleCase(final UTF8String target, final int collationId) {
+    return UTF8String.fromString(toTitleCase(target.toString(), collationId));
+  }
+
   public static String toTitleCase(final String target, final int collationId) {
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
@@ -351,9 +505,8 @@ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pat
 
   public static int indexOf(final UTF8String target, final UTF8String pattern,
       final int start, final int collationId) {
-    if (pattern.numBytes() == 0) {
-      return target.indexOfEmpty(start);
-    }
+    if (pattern.numBytes() == 0) return target.indexOfEmpty(start);
+    if (target.numBytes() == 0) return MATCH_NOT_FOUND;
 
     StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
     stringSearch.setIndex(start);

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -412,9 +412,9 @@ protected Collation buildCollation() {
             "UTF8_BINARY_LCASE",
             PROVIDER_SPARK,
             null,
-            UTF8String::compareLowerCase,
+            CollationAwareUTF8String::compareLowerCase,
             "1.0",
-            s -> (long) s.toLowerCase().hashCode(),
+            s -> (long) CollationAwareUTF8String.lowerCaseCodePoints(s.toString()).hashCode(),
             /* supportsBinaryEquality = */ false,
             /* supportsBinaryOrdering = */ false,
             /* supportsLowercaseEquality = */ true);
@@ -671,7 +671,7 @@ protected Collation buildCollation() {
           (s1, s2) -> collator.compare(s1.toString(), s2.toString()),
           ICU_COLLATOR_VERSION,
           s -> (long) collator.getCollationKey(s.toString()).hashCode(),
-          /* supportsBinaryEquality = */ collationId == UNICODE_COLLATION_ID,
+          /* supportsBinaryEquality = */ false,
           /* supportsBinaryOrdering = */ false,
           /* supportsLowercaseEquality = */ false);
       }

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -208,60 +208,76 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
   public static class Upper {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
+      }  else {
         return execICU(v, collationId);
       }
     }
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.Upper.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
+      }  else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toUpperCase();
     }
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toUpperCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(CollationAwareUTF8String.toUpperCase(v.toString(), collationId));
+      return CollationAwareUTF8String.toUpperCase(v, collationId);
     }
   }
 
   public static class Lower {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
       } else {
         return execICU(v, collationId);
       }
     }
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
         String expr = "CollationSupport.Lower.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
-      } else {
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
+      }  else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toLowerCase();
     }
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toLowerCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(CollationAwareUTF8String.toLowerCase(v.toString(), collationId));
+      return CollationAwareUTF8String.toLowerCase(v, collationId);
     }
   }
 
   public static class InitCap {
     public static UTF8String exec(final UTF8String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return execUTF8(v);
+      if (collation.supportsBinaryEquality) {
+        return execBinary(v);
+      } else if (collation.supportsLowercaseEquality) {
+        return execLowercase(v);
       } else {
         return execICU(v, collationId);
       }
@@ -270,25 +286,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) {
     public static String genCode(final String v, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.InitCap.exec";
-      if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
-        return String.format(expr + "UTF8(%s)", v);
+      if (collation.supportsBinaryEquality) {
+        return String.format(expr + "Binary(%s)", v);
+      } else if (collation.supportsLowercaseEquality) {
+        return String.format(expr + "Lowercase(%s)", v);
       } else {
         return String.format(expr + "ICU(%s, %d)", v, collationId);
       }
     }
-
-    public static UTF8String execUTF8(final UTF8String v) {
+    public static UTF8String execBinary(final UTF8String v) {
       return v.toLowerCase().toTitleCase();
     }
-
+    public static UTF8String execLowercase(final UTF8String v) {
+      return CollationAwareUTF8String.toTitleCase(v);
+    }
     public static UTF8String execICU(final UTF8String v, final int collationId) {
-      return UTF8String.fromString(
-              CollationAwareUTF8String.toTitleCase(
-                      CollationAwareUTF8String.toLowerCase(
-                              v.toString(),
-                              collationId
-                      ),
-                      collationId));
+      return CollationAwareUTF8String.toTitleCase(v, collationId);
     }
   }