Try using a queue to keep track of exec request times. #8

Open

wants to merge 513 commits into base: master

Commits (513)
837248a
[MINOR][DOC] Fix documentation for structured streaming - addListener
yeskarthik Feb 18, 2022
3a7eafd
[SPARK-38195][SQL] Add the `TIMESTAMPADD()` function
MaxGekk Feb 18, 2022
9fd9830
[SPARK-38225][SQL] Adjust input `format` of function `to_binary`
xinrong-meng Feb 18, 2022
0fcb560
[SPARK-38138][SQL] Materialize QueryPlan subqueries
pan3793 Feb 18, 2022
a92f873
[SPARK-38215][SQL] InsertIntoHiveDir should use data source if it's c…
AngersZhuuuu Feb 18, 2022
e263b65
[SPARK-38232][SQL] Explain formatted does not collect subqueries unde…
ulysses-you Feb 18, 2022
15532c7
[SPARK-35937][FOLLOW-UP][SQL] GetDateFieldOperations should skip unre…
Ngone51 Feb 18, 2022
e613f08
[SPARK-37867][SQL][FOLLOWUP] Compile aggregate functions for build-in…
beliefer Feb 18, 2022
b5eae59
[SPARK-38094] Enable matching schema column names by field ids
jackierwzhang Feb 18, 2022
42d8c50
[SPARK-38251][SQL] Change Cast.toString as "cast" instead of "ansi_ca…
gengliangwang Feb 18, 2022
4399755
[SPARK-38243][PYTHON][ML] Fix pyspark.ml.LogisticRegression.getThresh…
zero323 Feb 18, 2022
8a70aec
[SPARK-38246][CORE][SQL][SS][WEBUI] Refactor `KVUtils` and add UTs re…
LuciferYang Feb 19, 2022
bc6eb92
Revert "[SPARK-38244][K8S][BUILD] Upgrade kubernetes-client to 5.12.1"
dongjoon-hyun Feb 19, 2022
6ff760d
[SPARK-37154][PYTHON] Inline hints for pyspark.rdd
zero323 Feb 19, 2022
157dc7f
[SPARK-37428][PYTHON][MLLIB] Inline type hints for pyspark.mllib.util
zero323 Feb 19, 2022
06f4ce4
[SPARK-38175][CORE][FOLLOWUP] Remove `urlPattern` from `HistoryAppSta…
LuciferYang Feb 19, 2022
789a510
[SPARK-38249][CORE][GRAPHX] Cleanup unused private methods/fields
LuciferYang Feb 19, 2022
ae67add
[MINOR][DOCS] fix default value of history server
itayB Feb 20, 2022
4789e1f
[SPARK-37090][BUILD] Upgrade `libthrift` to 0.16.0 to avoid security …
wangyum Feb 20, 2022
8985427
[SPARK-38261][INFRA] Add missing R packages from base image
khalidmammadov Feb 21, 2022
b71b917
[SPARK-38236][SQL] Treat table location as absolute when the first le…
bozhang2820 Feb 21, 2022
e2796d2
[SPARK-38227][SQL][SS] Apply strict nullability of nested column in t…
HeartSaVioR Feb 21, 2022
6242145
[SPARK-37475][SQL] Add scale parameter to floor and ceil functions
sathiyapk Feb 21, 2022
17567f8
[SPARK-38140][SQL] Desc column stats (min, max) for timestamp type is…
wzhfy Feb 21, 2022
23119b0
[SPARK-38268][SQL] Hide the "failOnError" field in the toString metho…
gengliangwang Feb 21, 2022
3a750ca
[SPARK-38256][BUILD] Upgarde `scalatestplus-mockito` to 3.2.11.0
LuciferYang Feb 21, 2022
c538d26
[SPARK-37427][PYTHON][MLLIB] Inline typehints for pyspark.mllib.tree
zero323 Feb 21, 2022
a5caf0c
[SPARK-38276][SQL] Add approved TPCDS plans under ANSI mode
gengliangwang Feb 21, 2022
871bbf9
[SPARK-38274][BUILD] Upgarde `JUnit4` to `4.13.2` and upgrade corresp…
LuciferYang Feb 21, 2022
1112240
[SPARK-38259][BUILD] Upgrade Netty to 4.1.74
LuciferYang Feb 22, 2022
881f562
[SPARK-37290][SQL] - Exponential planning time in case of non-determi…
Stelyus Feb 22, 2022
5ebf793
[SPARK-38206][SS] Ignore nullability on comparing the data type of jo…
HeartSaVioR Feb 22, 2022
e6c5687
[SPARK-38155][SQL] Disallow distinct aggregate in lateral subqueries …
allisonwang-db Feb 22, 2022
a103a49
[SPARK-38279][TESTS][3.2] Pin MarkupSafe to 2.0.1 fix linter failure
itholic Feb 22, 2022
48b56c0
[SPARK-38278][PYTHON] Add SparkContext.addArchive in PySpark
HyukjinKwon Feb 22, 2022
ef818ed
[SPARK-38283][SQL] Test invalid datetime parsing under ANSI mode
gengliangwang Feb 22, 2022
c82e0fe
[SPARK-37422][PYTHON][MLLIB] Inline typehints for pyspark.mllib.feature
zero323 Feb 22, 2022
b683279
[SPARK-38271] PoissonSampler may output more rows than MaxRows
zhengruifeng Feb 22, 2022
43822cd
[SPARK-38060][SQL] Respect allowNonNumericNumbers when parsing quoted…
andygrove Feb 22, 2022
bd44611
[SPARK-38290][SQL] Fix JsonSuite and ParquetIOSuite under ANSI mode
gengliangwang Feb 22, 2022
27dbf6f
[SPARK-38291][BUILD][TESTS] Upgrade `postgresql` from 42.3.0 to 42.3.3
bjornjorgensen Feb 22, 2022
a11f799
[SPARK-38121][PYTHON][SQL][FOLLOW-UP] Make df.sparkSession return the…
HyukjinKwon Feb 23, 2022
4d75d47
[SPARK-38062][CORE] Avoid resolving placeholder hostname for Fallback…
xkrogen Feb 23, 2022
ceb32c9
[SPARK-38272][K8S][TESTS] Use `docker-desktop` instead of `docker-for…
Yikun Feb 23, 2022
2534217
[SPARK-38260][BUILD][CORE] Remove `commons-net` dependency in `hadoop…
LuciferYang Feb 23, 2022
43e93b5
[SPARK-38241][K8S][TESTS] Close KubernetesClient in K8S integrations …
martin-g Feb 23, 2022
b46b74c
[SPARK-38297][PYTHON] Explicitly cast the return value at DataFrame.t…
HyukjinKwon Feb 23, 2022
fab4ceb
[SPARK-38240][SQL] Improve RuntimeReplaceable and add a guideline for…
cloud-fan Feb 23, 2022
b425156
[SPARK-38162][SQL] Optimize one row plan in normal and AQE Optimizer
ulysses-you Feb 23, 2022
bf22078
[SPARK-38235][SQL][TESTS] Add test util for testing grouped aggregate…
itholic Feb 23, 2022
e18a93d
[SPARK-38295][SQL][TESTS] Fix ArithmeticExpressionSuite under ANSI mode
anchovYu Feb 23, 2022
a2448a4
[SPARK-38304][SQL] Elt() should return null if index is null under AN…
gengliangwang Feb 23, 2022
8ad85f8
[SPARK-38299][SQL] Clean up deprecated usage of `StringBuilder.newBui…
LuciferYang Feb 23, 2022
2fe5b04
[SPARK-38301][BUILD] Remove unused scala-actors dependency
leesf Feb 23, 2022
0bc16c6
[SPARK-38287][BUILD][SQL][TESTS] Upgrade `h2` from 2.0.204 to 2.1.210…
bjornjorgensen Feb 23, 2022
9257224
[SPARK-38281][SQL][TESTS] Fix AnalysisSuite under ANSI mode
anchovYu Feb 24, 2022
b28241d
[SPARK-38307][SQL][TESTS] Fix ExpressionTypeCheckingSuite and Collect…
anchovYu Feb 24, 2022
683bc46
[SPARK-38286][SQL] Union's maxRows and maxRowsPerPartition may overflow
zhengruifeng Feb 24, 2022
47c5b4c
[SPARK-38060][SQL][DOCS][FOLLOW-UP] Move migration guide note from CO…
HyukjinKwon Feb 24, 2022
fb543a7
[SPARK-38306][SQL] Fix ExplainSuite,StatisticsCollectionSuite and Str…
gengliangwang Feb 24, 2022
4357643
[SPARK-37923][SQL][FOLLOWUP] Rename MultipleBucketTransformsError in …
LuciferYang Feb 24, 2022
c4b013f
[SPARK-38229][FOLLOWUP][SQL] Clean up unnecessary code for code simpl…
yikf Feb 24, 2022
5190048
[SPARK-38300][SQL] Use `ByteStreams.toByteArray` to simplify `fileToS…
LuciferYang Feb 24, 2022
43c89dc
[SPARK-38273][SQL] `decodeUnsafeRows`'s iterators should close underl…
kevins-29 Feb 24, 2022
e58872d
[SPARK-38191][CORE] The staging directory of write job only needs to …
weixiuli Feb 25, 2022
9758d55
[SPARK-38303][BUILD] Upgrade `ansi-regex` from 5.0.0 to 5.0.1 in /dev
bjornjorgensen Feb 25, 2022
b8b1fbc
[SPARK-38275][SS] Include the writeBatch's memory usage as the total …
Myasuka Feb 25, 2022
860f44f
[SPARK-38311][SQL] Fix DynamicPartitionPruning/BucketedReadSuite/Expr…
gengliangwang Feb 25, 2022
6a79539
[SPARK-38298][SQL][TESTS] Fix DataExpressionSuite, NullExpressionsSui…
anchovYu Feb 25, 2022
95f06f3
[SPARK-37614][SQL] Support ANSI Aggregate Function: regr_avgx & regr_…
beliefer Feb 25, 2022
2dc0527
[SPARK-38322][SQL] Support query stage show runtime statistics in for…
ulysses-you Feb 25, 2022
e56f865
[SPARK-38316][SQL][TESTS] Fix SQLViewSuite/TriggerAvailableNowSuite/U…
gengliangwang Feb 25, 2022
29eca8c
[SPARK-38325][SQL] ANSI mode: avoid potential runtime error in HashJo…
gengliangwang Feb 25, 2022
64e1f28
[SPARK-38305][CORE] Explicitly check if source exists in unpack() bef…
srowen Feb 25, 2022
daa5f9d
[MINOR][DOCS] Fix missing field in query
Feb 25, 2022
b204710
[MINOR] Add git ignores for vscode and metals
Kimahriman Feb 25, 2022
dc153f5
[SPARK-38237][SQL][SS] Allow `ClusteredDistribution` to require full …
c21 Feb 25, 2022
9eab255
[SPARK-38242][CORE] Sort the SparkSubmit debug output
martin-g Feb 26, 2022
3aa0cd4
[SPARK-38302][K8S][TESTS] Use `Java 17` in K8S IT in case of `spark-t…
dcoliversun Feb 26, 2022
89464bf
[SPARK-36488][SQL][FOLLOWUP] Simplify the implementation of ResolveRe…
LuciferYang Feb 27, 2022
cfd66cf
[MINOR][PYTHON] Remove unnecessary quotes in pyspark
dchvn Feb 28, 2022
588064f
[SPARK-38339][BUILD] Upgrade `RoaringBitmap` to 0.9.25
LuciferYang Feb 28, 2022
0c74bff
[SPARK-38338][BUILD][CORE] Remove test dependency on `hamcrest`
LuciferYang Feb 28, 2022
309c65a
[SPARK-38337][CORE][SQL][DSTREAM][MLLIB] Replace `toIterator` with `i…
LuciferYang Feb 28, 2022
244716f
[SPARK-38321][SQL][TESTS] Fix BooleanSimplificationSuite under ANSI
anchovYu Feb 28, 2022
c7e363f
[SPARK-38244][K8S][BUILD] Upgrade kubernetes-client to 5.12.1
Yikun Feb 28, 2022
89799b8
[SPARK-38042][SQL] Ensure that ScalaReflection.dataTypeFor works on a…
jtnystrom Feb 28, 2022
50520fe
[SPARK-38314][SQL] Fix of failing to read parquet files after writing…
Yaohua628 Feb 28, 2022
744a223
[SPARK-38347][SQL] Fix nullability propagation in transformUpWithNewO…
sigmod Feb 28, 2022
2f5cfb0
[SPARK-38180][SQL] Allow safe up-cast expressions in correlated equal…
allisonwang-db Feb 28, 2022
07a6f0b
[SPARK-38343][SQL][TESTS] Fix SQLQuerySuite under ANSI mode
gengliangwang Feb 28, 2022
6df10ce
[SPARK-38332][SQL] Add the `DATEADD()` and `DATE_ADD()` aliases for `…
MaxGekk Feb 28, 2022
6aa83e7
[SPARK-38033][SS] The SS processing cannot be started because the com…
Feb 28, 2022
02aa6a0
[SPARK-38352][SQL] Fix DataFrameAggregateSuite/DataFrameSetOperations…
gengliangwang Mar 1, 2022
969d672
[SPARK-37688][CORE] ExecutorMonitor should ignore SparkListenerBlockU…
sleep1661 Mar 1, 2022
9336db7
Revert "[SPARK-38191][CORE] The staging directory of write job only n…
srowen Mar 1, 2022
1d068ce
[SPARK-38318][SQL] Skip view cyclic reference check if view is stored…
linhongliu-db Mar 1, 2022
ad4e5a6
[SPARK-38323][SQL][STREAMING] Support the hidden file metadata in Str…
Yaohua628 Mar 1, 2022
2da0d07
[SPARK-37582][SPARK-37583][SQL] CONTAINS, STARTSWITH, ENDSWITH should…
AngersZhuuuu Mar 1, 2022
1b95cfe
[SPARK-38348][BUILD] Upgrade `tink` to 1.6.1
LuciferYang Mar 1, 2022
615c5d8
[MINOR] Clean up an unnecessary variable
weixiuli Mar 1, 2022
6cd5803
[SPARK-38284][SQL] Add the `TIMESTAMPDIFF()` function
MaxGekk Mar 1, 2022
a633f77
[SPARK-37932][SQL] Wait to resolve missing attributes before applying…
chenzhx Mar 1, 2022
5c23c76
[SPARK-38358][DOC] Add migration guide for `spark.sql.hive.convertMet…
AngersZhuuuu Mar 1, 2022
ccb8af6
[SPARK-38188][K8S] Support `spark.kubernetes.job.queue`
Yikun Mar 1, 2022
42f118a
[SPARK-33206][CORE] Fix shuffle index cache weight calculation for sm…
attilapiros Mar 1, 2022
e81333c
[SPARK-37593][CORE] Reduce default page size by `LONG_ARRAY_OFFSET` i…
WangGuangxin Mar 1, 2022
c7b0dd2
[SPARK-38362][BUILD] Move eclipse.m2e Maven plugin config in its own …
martin-g Mar 2, 2022
96bcb04
[SPARK-38344][SHUFFLE] Avoid to submit task when there are no request…
weixiuli Mar 2, 2022
80f25ad
[SPARK-38363][SQL] Avoid runtime error in Dataset.summary()/Dataset.d…
gengliangwang Mar 2, 2022
5664403
[SPARK-38094][SQL][FOLLOWUP] Fix exception message and add a test case
jackierwzhang Mar 2, 2022
f14f6d6
[SPARK-38357][SQL][TESTS] Add test coverage for file source with OR(d…
huaxingao Mar 2, 2022
3ab18cc
[SPARK-38383][K8S] Support `APP_ID` and `EXECUTOR_ID` placeholder in …
dongjoon-hyun Mar 2, 2022
42db298
Revert "[SPARK-37090][BUILD] Upgrade `libthrift` to 0.16.0 to avoid s…
dongjoon-hyun Mar 2, 2022
b141c15
[SPARK-38342][CORE] Clean up deprecated api usage of Ivy
LuciferYang Mar 2, 2022
f960328
[SPARK-38389][SQL] Add the `DATEDIFF()` and `DATE_DIFF()` aliases for…
MaxGekk Mar 2, 2022
4d4c044
[SPARK-38392][K8S][TESTS] Add `spark-` prefix to namespaces and `-dri…
martin-g Mar 2, 2022
ad5427e
[SPARK-36553][ML] KMeans avoid compute auxiliary statistics for large K
zhengruifeng Mar 2, 2022
829d7fb
[MINOR][SQL][DOCS] Add more examples to sql-ref-syntax-ddl-create-tab…
wangyum Mar 2, 2022
226bdec
[SPARK-38269][CORE][SQL][SS][ML][MLLIB][MESOS][YARN][K8S][EXAMPLES] C…
LuciferYang Mar 2, 2022
23db9b4
[SPARK-38191][CORE][FOLLOWUP] The staging directory of write job only…
weixiuli Mar 3, 2022
86e0903
[SPARK-38398][K8S][TESTS] Add `priorityClassName` integration test case
dongjoon-hyun Mar 3, 2022
dfff8d8
[SPARK-38353][PYTHON] Instrument __enter__ and __exit__ magic methods…
heyihong Mar 3, 2022
b71d6d0
[SPARK-38378][SQL] Refactoring of the ANTLR grammar definition into s…
zhenlineo Mar 3, 2022
b81d90b
[SPARK-38312][CORE] Use error class in GraphiteSink
bozhang2820 Mar 3, 2022
34618a7
[SPARK-38351][TESTS] Don't use deprecate symbol API in test classes
martin-g Mar 3, 2022
5039c0f
[SPARK-38345][SQL] Introduce SQL function ARRAY_SIZE
xinrong-meng Mar 4, 2022
83d8000
[SPARK-38196][SQL] Refactor framework so as JDBC dialect could compil…
beliefer Mar 4, 2022
ae9b804
[SPARK-38417][CORE] Remove `Experimental` from `RDD.cleanShuffleDepen…
dongjoon-hyun Mar 5, 2022
980d88d
[SPARK-38418][PYSPARK] Add PySpark `cleanShuffleDependencies` develop…
dongjoon-hyun Mar 5, 2022
727f044
[SPARK-38189][K8S][DOC] Add `Priority scheduling` doc for Spark on K8S
Yikun Mar 5, 2022
97716f7
[SPARK-38393][SQL] Clean up deprecated usage of `GenSeq/GenMap`
LuciferYang Mar 5, 2022
18219d4
[SPARK-37400][SPARK-37426][PYTHON][MLLIB] Inline type hints for pyspa…
zero323 Mar 6, 2022
69bc9d1
[SPARK-38239][PYTHON][MLLIB] Fix pyspark.mllib.LogisticRegressionMode…
zero323 Mar 6, 2022
135841f
[SPARK-38411][CORE] Use `UTF-8` when `doMergeApplicationListingIntern…
pan3793 Mar 6, 2022
b651617
[SPARK-38416][PYTHON][TESTS] Change day to month
bjornjorgensen Mar 7, 2022
3175d83
[SPARK-38394][BUILD] Upgrade `scala-maven-plugin` to 4.4.0 for Hadoop…
steveloughran Mar 7, 2022
b99f58a
[SPARK-38267][CORE][SQL][SS] Replace pattern matches on boolean expre…
LuciferYang Mar 7, 2022
d83ab94
[SPARK-38419][BUILD] Replace tabs that exist in the script with spaces
jackylee-ch Mar 7, 2022
fc6b5e5
[SPARK-38188][K8S][TESTS][FOLLOWUP] Cleanup resources in `afterEach`
Yikun Mar 7, 2022
3bbc43d
[SPARK-38430][K8S][DOCS] Add `SBT` commands to K8s IT README
williamhyun Mar 7, 2022
f36d1bf
[SPARK-38423][K8S] Reuse driver pod's `priorityClassName` for `PodGroup`
Yikun Mar 7, 2022
4883a80
[SPARK-38382][DOC] Fix incorrect version infomation of migration guid…
AngersZhuuuu Mar 7, 2022
e21cb62
[SPARK-38335][SQL] Implement parser support for DEFAULT column values
dtenedor Mar 7, 2022
c1e5e8a
[SPARK-38407][SQL] ANSI Cast: loosen the limitation of casting non-nu…
gengliangwang Mar 7, 2022
1b31b7c
[SPARK-38434][SQL] Correct semantic of CheckAnalysis.getDataTypesAreC…
ivoson Mar 7, 2022
ed3a61d
[SPARK-38394][BUILD][FOLLOWUP] Update comments about `scala-maven-plu…
steveloughran Mar 7, 2022
60d3de1
[SPARK-38104][SQL] Migrate parsing errors of window into the new erro…
yutoacts Mar 7, 2022
ddc1803
[SPARK-38414][CORE][DSTREAM][EXAMPLES][ML][MLLIB][SQL] Remove redunda…
LuciferYang Mar 7, 2022
6c486d2
[SPARK-38436][PYTHON][TESTS] Fix `test_ceil` to test `ceil`
bjornjorgensen Mar 7, 2022
71991f7
[SPARK-38285][SQL] Avoid generator pruning for invalid extractor
viirya Mar 7, 2022
a13b478
[SPARK-38183][PYTHON][FOLLOWUP] Check the ANSI conf properly when cre…
itholic Mar 8, 2022
14cda58
[SPARK-38385][SQL] Improve error messages of 'mismatched input' cases…
anchovYu Mar 8, 2022
e80d979
[SPARK-37895][SQL] Filter push down column with quoted columns
planga82 Mar 8, 2022
e5ba617
[SPARK-38361][SQL] Add factory method `getConnection` into `JDBCDialect`
beliefer Mar 8, 2022
4df8512
[SPARK-37283][SQL][FOLLOWUP] Avoid trying to store a table which cont…
sarutak Mar 8, 2022
9e1d00c
[SPARK-38406][SQL] Improve perfermance of ShufflePartitionsUtil creat…
ulysses-you Mar 8, 2022
cd32c22
[SPARK-38240][SQL][FOLLOW-UP] Make RuntimeReplaceableAggregate as an …
HyukjinKwon Mar 8, 2022
9854456
[SPARK-35956][K8S][FOLLOWP] Fix typos in config names
dongjoon-hyun Mar 8, 2022
13021ed
[SPARK-38442][SQL] Fix ConstantFoldingSuite/ColumnExpressionSuite/Dat…
gengliangwang Mar 8, 2022
8a0b101
[SPARK-38112][SQL] Use error classes in the execution errors of date/…
ivoson Mar 8, 2022
8b08f19
[SPARK-37753][SQL] Fine tune logic to demote Broadcast hash join in D…
ekoifman Mar 8, 2022
b5589a9
[SPARK-38423][K8S][FOLLOWUP] PodGroup spec should not be null
dongjoon-hyun Mar 8, 2022
0ad7677
[SPARK-38309][CORE] Fix SHS `shuffleTotalReads` and `shuffleTotalBloc…
robreeves Mar 8, 2022
8fabd5e
[SPARK-38428][SHUFFLE] Check the FetchShuffleBlocks message only once…
weixiuli Mar 8, 2022
049d6d1
[SPARK-38443][SS][DOC] Document config STREAMING_SESSION_WINDOW_MERGE…
viirya Mar 9, 2022
59ce0a7
[SPARK-37865][SQL] Fix union deduplication correctness bug
karenfeng Mar 9, 2022
43c7824
[SPARK-38412][SS] Fix the swapped sequence of from and to in StateSch…
HeartSaVioR Mar 9, 2022
f2058eb
[SPARK-38450][SQL] Fix HiveQuerySuite//PushFoldableIntoBranchesSuite/…
gengliangwang Mar 9, 2022
35c0e5c
[MINOR][PYTHON] Fix `MultilayerPerceptronClassifierTest.test_raw_and_…
harupy Mar 9, 2022
4da04fc
[SPARK-37600][BUILD] Upgrade to Hadoop 3.3.2
sunchao Mar 9, 2022
b8c03ee
[SPARK-38455][SPARK-38187][K8S] Support driver/executor `PodGroup` te…
dongjoon-hyun Mar 9, 2022
587ec34
[SPARK-38449][SQL] Avoid call createTable when ignoreIfExists=true an…
AngersZhuuuu Mar 9, 2022
66ff4b6
[SPARK-38452][K8S][TESTS] Support pyDockerfile and rDockerfile in SBT…
Yikun Mar 9, 2022
52e7602
[SPARK-38458][SQL] Fix always false condition in `LogDivertAppender#i…
LuciferYang Mar 9, 2022
bd6a3b4
[SPARK-38437][SQL] Lenient serialization of datetime from datasource
MaxGekk Mar 9, 2022
62e4c29
[SPARK-37421][PYTHON] Inline type hints for python/pyspark/mllib/eval…
dchvn Mar 9, 2022
93a25a4
[SPARK-37947][SQL] Extract generator from GeneratorOuter expression c…
bersprockets Mar 9, 2022
1584366
[SPARK-38354][SQL] Add hash probes metric for shuffled hash join
c21 Mar 9, 2022
effef84
[SPARK-36681][CORE][TEST] Enable SnappyCodec test in FileSuite
viirya Mar 9, 2022
97df016
[SPARK-38480][K8S] Remove `spark.kubernetes.job.queue` in favor of `s…
dongjoon-hyun Mar 9, 2022
01014aa
[SPARK-38486][K8S][TESTS] Upgrade the minimum Minikube version to 1.18.0
dongjoon-hyun Mar 10, 2022
0f4c26a
[SPARK-38387][PYTHON] Support `na_action` and Series input correspond…
xinrong-meng Mar 10, 2022
bd08e79
[SPARK-38355][PYTHON][TESTS] Use `mkstemp` instead of `mktemp`
bjornjorgensen Mar 10, 2022
ecabfb1
[SPARK-38187][K8S][TESTS] Add K8S IT for `volcano` minResources cpu/m…
Yikun Mar 10, 2022
82b6194
[SPARK-38385][SQL] Improve error messages of empty statement and <EOF…
anchovYu Mar 10, 2022
f286416
[SPARK-38379][K8S] Fix Kubernetes Client mode when mounting persisten…
tgravescs Mar 10, 2022
ec544ad
[SPARK-38148][SQL] Do not add dynamic partition pruning if there exis…
ulysses-you Mar 10, 2022
e5a86a3
[SPARK-38453][K8S][DOCS] Add `volcano` section to K8s IT `README.md`
Yikun Mar 10, 2022
c483e29
[SPARK-38487][PYTHON][DOC] Fix docstrings of nlargest/nsmallest of Da…
xinrong-meng Mar 10, 2022
3ab2455
[SPARK-38499][BUILD] Upgrade Jackson to 2.13.2
dongjoon-hyun Mar 10, 2022
bcf7849
[SPARK-38489][SQL] Aggregate.groupOnly support foldable expressions
wangyum Mar 10, 2022
538c81b
[SPARK-38481][SQL] Substitute Java overflow exception from `TIMESTAMP…
MaxGekk Mar 10, 2022
5cbd9b4
[SPARK-38500][INFRA] Add ASF License header to all Service Provider c…
yaooqinn Mar 10, 2022
216b972
[SPARK-38360][SQL][SS][PYTHON] Introduce a `exists` function for `Tre…
LuciferYang Mar 10, 2022
0a4a12d
[SPARK-38490][SQL][INFRA] Add Github action test job for ANSI SQL mode
gengliangwang Mar 10, 2022
a26c01d
[SPARK-38451][R][TESTS] Fix `make_date` test case to pass with ANSI mode
HyukjinKwon Mar 10, 2022
024d03e
[SPARK-38501][SQL] Fix thriftserver test failures under ANSI mode
gengliangwang Mar 10, 2022
f852100
[SPARK-38513][K8S] Move custom scheduler-specific configs to under `s…
dongjoon-hyun Mar 10, 2022
2239e9d
[MINOR][DOCS] Fix minor typos at nulls_option in Window Functions
bfallik Mar 11, 2022
54abb85
[SPARK-38517][INFRA] Fix PySpark documentation generation (missing ip…
HyukjinKwon Mar 11, 2022
aec70e8
[SPARK-38511][K8S] Remove `priorityClassName` propagation in favor of…
dongjoon-hyun Mar 11, 2022
2e3ac4f
[SPARK-38509][SQL] Unregister the `TIMESTAMPADD/DIFF` functions and r…
MaxGekk Mar 11, 2022
34e3029
[SPARK-38107][SQL] Use error classes in the compilation errors of pyt…
itholic Mar 11, 2022
36023c2
[SPARK-38491][PYTHON] Support `ignore_index` of `Series.sort_values`
xinrong-meng Mar 11, 2022
b1d8f35
[SPARK-38518][PYTHON] Implement `skipna` of `Series.all/Index.all` to…
xinrong-meng Mar 11, 2022
fd5896b
[SPARK-38527][K8S][DOCS] Set the minimum Volcano version
dongjoon-hyun Mar 11, 2022
60334d7
[SPARK-38516][BUILD] Add log4j-core and log4j-api to classpath if act…
wangyum Mar 12, 2022
c91c2e9
[SPARK-38526][SQL] Fix misleading function alias name for RuntimeRepl…
cloud-fan Mar 12, 2022
a511ca1
[SPARK-38534][SQL][TESTS] Disable `to_timestamp('366', 'DD')` test case
dongjoon-hyun Mar 12, 2022
c032928
[SPARK-37430][PYTHON][MLLIB] Inline hints for pyspark.mllib.linalg.di…
hi-zir Mar 12, 2022
6becf4e
[SPARK-38538][K8S][TESTS] Fix driver environment verification in Basi…
dongjoon-hyun Mar 13, 2022
96e5446
[SPARK-36058][K8S][TESTS][FOLLOWUP] Fix error message to include exce…
dongjoon-hyun Mar 13, 2022
6b64e5d
[SPARK-38320][SS] Fix flatMapGroupsWithState timeout in batch with da…
alex-balikov Mar 13, 2022
786a70e
[SPARK-38537][K8S] Unify `Statefulset*` to `StatefulSet*`
dongjoon-hyun Mar 13, 2022
9bede26
[MINOR][K8S][TESTS] Remove `verifyPriority` from `VolcanoFeatureStepS…
williamhyun Mar 13, 2022
0840b23
[SPARK-38540][BUILD] Upgrade `compress-lzf` from 1.0.3 to 1.1
LuciferYang Mar 13, 2022
83673c8
[SPARK-38528][SQL] Eagerly iterate over aggregate sequence when build…
bersprockets Mar 14, 2022
715a06c
[SPARK-38532][SS][TESTS] Add test case for invalid gapDuration of ses…
nyingping Mar 14, 2022
5699095
[SPARK-38519][SQL] AQE throw exception should respect SparkFatalExcep…
ulysses-you Mar 14, 2022
efe4330
[SPARK-38410][SQL] Support specify initial partition number for rebal…
ulysses-you Mar 14, 2022
9596942
[SPARK-38523][SQL] Fix referring to the corrupt record column from CSV
MaxGekk Mar 14, 2022
35536a1
[SPARK-38103][SQL] Migrate parsing errors of transform into the new e…
Mar 14, 2022
8e44791
[SPARK-38504][SQL] Cannot read TimestampNTZ as TimestampLTZ
beliefer Mar 14, 2022
2844a18
[SPARK-38360][SQL][AVRO][SS][FOLLOWUP] Replace `TreeNode.collectFirst…
LuciferYang Mar 14, 2022
130bcce
[SPARK-38415][SQL] Update the histogram_numeric (x, y) result type to…
dtenedor Mar 14, 2022
a342214
[SPARK-38535][SQL] Add the `datetimeUnit` enum and use it in `TIMESTA…
MaxGekk Mar 14, 2022
5bb001b
[SPARK-36967][FOLLOWUP][CORE] Report accurate shuffle block size if i…
wankunde Mar 14, 2022
0005b41
[SPARK-38400][PYTHON] Enable Series.rename to change index labels
xinrong-meng Mar 14, 2022
c16a66a
[SPARK-36194][SQL] Add a logical plan visitor to propagate the distin…
wangyum Mar 14, 2022
f6c4634
[SPARK-37491][PYTHON] Fix Series.asof for unsorted values
pralabhkumar Mar 14, 2022
a30575e
[SPARK-38544][BUILD] Upgrade log4j2 to 2.17.2
LuciferYang Mar 14, 2022
1d4e917
[SPARK-38521][SQL] Change `partitionOverwriteMode` from string to var…
jackylee-ch Mar 15, 2022
8b5ec77
[SPARK-38549][SS] Add `numRowsDroppedByWatermark` to `SessionWindowSt…
viirya Mar 15, 2022
f17f078
[SPARK-38513][K8S][FOLLWUP] Cleanup executor-podgroup-template.yml
Yikun Mar 15, 2022
58c21e5
[SPARK-38527][K8S][DOCS][FOLLOWUP] Use v1.5.0 tag instead of release-1.5
dongjoon-hyun Mar 15, 2022
2a63fea
Revert "[SPARK-38544][BUILD] Upgrade log4j2 to 2.17.2"
wangyum Mar 15, 2022
c00942d
[SPARK-38524][SPARK-38553][K8S] Bump `Volcano` to v1.5.1 and fix Volc…
Yikun Mar 15, 2022
21db916
[SPARK-38484][PYTHON] Move usage logging instrumentation util functio…
heyihong Mar 15, 2022
4e31000
[SPARK-38204][SS] Use StatefulOpClusteredDistribution for stateful op…
HeartSaVioR Mar 15, 2022
f84018a
[SPARK-38424][PYTHON] Warn unused casts and ignores
zero323 Mar 16, 2022
1acadf3
[SPARK-38558][SQL] Remove unnecessary casts between IntegerType and I…
cashmand Mar 16, 2022
8476c8b
[SPARK-38542][SQL] UnsafeHashedRelation should serialize numKeys out
mcdull-zhang Mar 16, 2022
8193b40
[SPARK-38563][PYTHON] Upgrade to Py4J 0.10.9.4
HyukjinKwon Mar 16, 2022
1b41416
[SPARK-38106][SQL] Use error classes in the parsing errors of functions
ivoson Mar 16, 2022
71e2110
[SPARK-38194][YARN][MESOS][K8S] Make memory overhead factor configurable
Kimahriman Mar 16, 2022
32b0705
Try using a queue to keep track of exec request times.
holdenk Nov 1, 2021
b139440
ugh workflows
holdenk Jan 11, 2022
b01149c
Revert "ugh workflows"
holdenk Jan 11, 2022
603ba97
Fix the resource request decrease scenario and add a test for it.
holdenk Mar 16, 2022
Original file line number Diff line number Diff line change
@@ -15,26 +15,20 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

from typing import Dict, List

from pyspark.sql.types import Row, StructType

from numpy import ndarray

class _ImageSchema:
    def __init__(self) -> None: ...
    @property
    def imageSchema(self) -> StructType: ...
    @property
    def ocvTypes(self) -> Dict[str, int]: ...
    @property
    def columnSchema(self) -> StructType: ...
    @property
    def imageFields(self) -> List[str]: ...
    @property
    def undefinedImageType(self) -> str: ...
    def toNDArray(self, image: Row) -> ndarray: ...
    def toImage(self, array: ndarray, origin: str = ...) -> Row: ...

ImageSchema: _ImageSchema

name: ANSI SQL mode test

on:
  push:
    branches:
      - master

jobs:
  ansi_sql_test:
    uses: ./.github/workflows/build_and_test.yml
    if: github.repository == 'apache/spark'
    with:
      ansi_enabled: true
32 changes: 22 additions & 10 deletions .github/workflows/build_and_test.yml
@@ -37,6 +37,12 @@ on:
- cron: '0 13 * * *'
# Java 17
- cron: '0 16 * * *'
  workflow_call:
    inputs:
      ansi_enabled:
        required: false
        type: boolean
        default: false

jobs:
configure-jobs:
@@ -92,7 +98,7 @@ jobs:
echo '::set-output name=java::8'
echo '::set-output name=branch::master' # Default branch to run on. CHANGE here when a branch is cut out.
echo '::set-output name=type::regular'
echo '::set-output name=envs::{}'
echo '::set-output name=envs::{"SPARK_ANSI_SQL_MODE": "${{ inputs.ansi_enabled }}"}'
echo '::set-output name=hadoop::hadoop3'
fi

@@ -252,7 +258,7 @@ jobs:
- name: Install Python packages (Python 3.8)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
python3.8 -m pip install 'numpy>=1.20.0' 'pyarrow<5.0.0' pandas scipy xmlrunner
python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy xmlrunner
python3.8 -m pip list
# Run the tests.
- name: Run tests
@@ -287,7 +293,7 @@ jobs:
name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20211228
image: dongjoon/apache-spark-github-action-image:20220207
strategy:
fail-fast: false
matrix:
@@ -311,6 +317,7 @@
SKIP_UNIDOC: true
SKIP_MIMA: true
METASPACE_SIZE: 1g
SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -391,13 +398,14 @@ jobs:
name: "Build modules: sparkr"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20211228
image: dongjoon/apache-spark-github-action-image:20220207
env:
HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_MIMA: true
SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -462,7 +470,7 @@ jobs:
PYSPARK_DRIVER_PYTHON: python3.9
PYSPARK_PYTHON: python3.9
container:
image: dongjoon/apache-spark-github-action-image:20211228
image: dongjoon/apache-spark-github-action-image:20220207
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
@@ -529,11 +537,14 @@ jobs:
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0'
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' 'pyarrow<5.0.0' pandas 'plotly>=4.8'
# Pin the MarkupSafe to 2.0.1 to resolve the CI error.
# See also https://issues.apache.org/jira/browse/SPARK-38279.
python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1'
python3.9 -m pip install ipython_genutils # See SPARK-38517
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
apt-get update -y
apt-get install -y ruby ruby-dev
Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2'), repos='https://cloud.r-project.org/')"
Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
gem install bundler
@@ -614,7 +625,7 @@ jobs:
export MAVEN_CLI_OPTS="--no-transfer-progress"
export JAVA_VERSION=${{ matrix.java }}
# It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
rm -rf ~/.m2/repository/org/apache/spark

scala-213:
@@ -660,7 +671,7 @@ jobs:
- name: Build with SBT
run: |
./dev/change-scala-version.sh 2.13
./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile
./build/sbt -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile

tpcds-1g:
needs: [configure-jobs, precondition]
@@ -669,6 +680,7 @@
runs-on: ubuntu-20.04
env:
SPARK_LOCAL_IP: localhost
SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
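The hunks above thread the new `ansi_enabled` workflow input through to the jobs in two ways: directly as the `SPARK_ANSI_SQL_MODE` environment variable, and embedded in the JSON emitted by `::set-output name=envs::`. A small Python sketch of that JSON construction (assuming, as GitHub Actions expression interpolation does, that the boolean renders as the string "true" or "false" — the real workflow does this in shell, not Python):

```python
import json


def build_envs_output(ansi_enabled: bool) -> str:
    """Mimic the diff's `::set-output name=envs::{...}` line.

    Hypothetical helper; the workflow emits this string from a shell step.
    """
    # ${{ inputs.ansi_enabled }} interpolates as "true"/"false".
    rendered = "true" if ansi_enabled else "false"
    return "::set-output name=envs::" + json.dumps(
        {"SPARK_ANSI_SQL_MODE": rendered}
    )
```

Note that the value crossing the job boundary is a string, not a boolean, so downstream consumers must compare against `"true"` rather than truthiness.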
38 changes: 29 additions & 9 deletions .github/workflows/notify_test_workflow.yml
@@ -38,12 +38,19 @@ jobs:
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const endpoint = 'GET /repos/:owner/:repo/commits/:ref/check-runs'
const endpoint = 'GET /repos/:owner/:repo/actions/workflows/:id/runs?&branch=:branch'
const check_run_endpoint = 'GET /repos/:owner/:repo/commits/:ref/check-runs'

// TODO: Should use pull_request.user and pull_request.user.repos_url?
// If a different person creates a commit to another forked repo,
// it wouldn't be able to detect.
const params = {
owner: context.payload.pull_request.head.repo.owner.login,
repo: context.payload.pull_request.head.repo.name,
id: 'build_and_test.yml',
branch: context.payload.pull_request.head.ref,
}
const check_run_params = {
owner: context.payload.pull_request.head.repo.owner.login,
repo: context.payload.pull_request.head.repo.name,
ref: context.payload.pull_request.head.ref,
@@ -67,7 +74,7 @@
const head_sha = context.payload.pull_request.head.sha
let status = 'queued'

if (!runs || runs.data.check_runs.filter(r => r.name === "Configure jobs").length === 0) {
if (!runs || runs.data.workflow_runs.length === 0) {
status = 'completed'
const conclusion = 'action_required'

@@ -99,16 +106,29 @@
}
})
} else {
const runID = runs.data.check_runs.filter(r => r.name === "Configure jobs")[0].id
const run_id = runs.data.workflow_runs[0].id

if (runs.data.check_runs[0].head_sha != context.payload.pull_request.head.sha) {
if (runs.data.workflow_runs[0].head_sha != context.payload.pull_request.head.sha) {
throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
}

const runUrl = 'https://github.com/'
// Here we get check run ID to provide Check run view instead of Actions view, see also SPARK-37879.
const check_runs = await github.request(check_run_endpoint, check_run_params)
const check_run_head = check_runs.data.check_runs.filter(r => r.name === "Configure jobs")[0]

if (check_run_head.head_sha != context.payload.pull_request.head.sha) {
throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
}

const check_run_url = 'https://github.com/'
+ context.payload.pull_request.head.repo.full_name
+ '/runs/'
+ runID
+ check_run_head.id

const actions_url = 'https://github.com/'
+ context.payload.pull_request.head.repo.full_name
+ '/actions/runs/'
+ run_id

github.checks.create({
owner: context.repo.owner,
@@ -118,13 +138,13 @@
status: status,
output: {
title: 'Test results',
summary: '[See test results](' + runUrl + ')',
summary: '[See test results](' + check_run_url + ')',
text: JSON.stringify({
owner: context.payload.pull_request.head.repo.owner.login,
repo: context.payload.pull_request.head.repo.name,
run_id: runID
run_id: run_id
})
},
details_url: runUrl,
details_url: actions_url,
})
}
5 changes: 5 additions & 0 deletions .gitignore
@@ -9,6 +9,8 @@
*~
.java-version
.DS_Store
.ammonite
.bloop
.bsp/
.cache
.classpath
@@ -21,10 +23,12 @@
# SPARK-35223: Add IssueNavigationLink to make IDEA support hyperlink on JIRA Ticket and GitHub PR on Git plugin.
!.idea/vcs.xml
.idea_modules/
.metals
.project
.pydevproject
.scala_dependencies
.settings
.vscode
/lib/
R-unit-tests.log
R/unit-tests.out
@@ -59,6 +63,7 @@ lint-r-report.log
lint-js-report.log
log/
logs/
metals.sbt
out/
project/boot/
project/build/target/
2 changes: 2 additions & 0 deletions LICENSE-binary
@@ -456,6 +456,7 @@ net.sf.py4j:py4j
org.jpmml:pmml-model
org.jpmml:pmml-schema
org.threeten:threeten-extra
org.jdom:jdom2

python/lib/py4j-*-src.zip
python/pyspark/cloudpickle.py
@@ -504,6 +505,7 @@ Common Development and Distribution License (CDDL) 1.0
javax.activation:activation http://www.oracle.com/technetwork/java/javase/tech/index-jsp-138795.html
javax.xml.stream:stax-api https://jcp.org/en/jsr/detail?id=173
javax.transaction:javax.transaction-api
javax.xml.bind:jaxb-api


Common Development and Distribution License (CDDL) 1.1
3 changes: 3 additions & 0 deletions NOTICE-binary
@@ -917,6 +917,9 @@ This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin
g Package (jaspell): http://jaspell.sourceforge.net/
License: The BSD License (http://www.opensource.org/licenses/bsd-license.php)

This product includes software developed by the JDOM Project (http://www.jdom.org/)
License: https://raw.githubusercontent.com/hunterhacker/jdom/master/LICENSE.txt

The snowball stemmers in
analysis/common/src/java/net/sf/snowball
were developed by Martin Porter and Richard Boulton.
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -60,7 +60,7 @@ Collate:
'types.R'
'utils.R'
'window.R'
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
VignetteBuilder: knitr
NeedsCompilation: no
Encoding: UTF-8
20 changes: 13 additions & 7 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1690,9 +1690,9 @@ test_that("column functions", {

df <- as.DataFrame(list(list("col" = "1")))
c <- collect(select(df, schema_of_csv("Amsterdam,2018")))
expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
expect_equal(c[[1]], "STRUCT<_c0: STRING, _c1: INT>")
c <- collect(select(df, schema_of_csv(lit("Amsterdam,2018"))))
expect_equal(c[[1]], "STRUCT<`_c0`: STRING, `_c1`: INT>")
expect_equal(c[[1]], "STRUCT<_c0: STRING, _c1: INT>")

# Test to_json(), from_json(), schema_of_json()
df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
@@ -1725,9 +1725,9 @@ test_that("column functions", {

df <- as.DataFrame(list(list("col" = "1")))
c <- collect(select(df, schema_of_json('{"name":"Bob"}')))
expect_equal(c[[1]], "STRUCT<`name`: STRING>")
expect_equal(c[[1]], "STRUCT<name: STRING>")
c <- collect(select(df, schema_of_json(lit('{"name":"Bob"}'))))
expect_equal(c[[1]], "STRUCT<`name`: STRING>")
expect_equal(c[[1]], "STRUCT<name: STRING>")

# Test to_json() supports arrays of primitive types and arrays
df <- sql("SELECT array(19, 42, 70) as age")
@@ -2051,13 +2051,19 @@ test_that("date functions on a DataFrame", {
})

test_that("SPARK-37108: expose make_date expression in R", {
ansiEnabled <- sparkR.conf("spark.sql.ansi.enabled")[[1]] == "true"
df <- createDataFrame(
list(list(2021, 10, 22), list(2021, 13, 1),
list(2021, 2, 29), list(2020, 2, 29)),
c(
list(list(2021, 10, 22), list(2020, 2, 29)),
if (ansiEnabled) list() else list(list(2021, 13, 1), list(2021, 2, 29))
),
list("year", "month", "day")
)
expect <- createDataFrame(
list(list(as.Date("2021-10-22")), NA, NA, list(as.Date("2020-02-29"))),
c(
list(list(as.Date("2021-10-22")), list(as.Date("2020-02-29"))),
if (ansiEnabled) list() else list(NA, NA)
),
list("make_date(year, month, day)")
)
actual <- select(df, make_date(df$year, df$month, df$day))
2 changes: 1 addition & 1 deletion bin/pyspark
@@ -50,7 +50,7 @@ export PYSPARK_DRIVER_PYTHON_OPTS

# Add the PySpark classes to the Python path:
export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.4-src.zip:$PYTHONPATH"

# Load the PySpark shell.py script when ./pyspark is used interactively:
export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"
2 changes: 1 addition & 1 deletion bin/pyspark2.cmd
@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
)

set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.3-src.zip;%PYTHONPATH%
set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.4-src.zip;%PYTHONPATH%

set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
@@ -200,7 +200,7 @@ public int hashCode() {
public int compareTo(ComparableObjectArray other) {
int len = Math.min(array.length, other.array.length);
for (int i = 0; i < len; i++) {
int diff = ((Comparable<Object>) array[i]).compareTo((Comparable<Object>) other.array[i]);
int diff = ((Comparable<Object>) array[i]).compareTo(other.array[i]);
if (diff != 0) {
return diff;
}
@@ -329,13 +329,14 @@ private int countKeys(Class<?> type) throws Exception {
byte[] prefix = db.getTypeInfo(type).keyPrefix();
int count = 0;

DBIterator it = db.db().iterator();
it.seek(prefix);

while (it.hasNext()) {
byte[] key = it.next().getKey();
if (LevelDBIterator.startsWith(key, prefix)) {
count++;
try (DBIterator it = db.db().iterator()) {
it.seek(prefix);

while (it.hasNext()) {
byte[] key = it.next().getKey();
if (LevelDBIterator.startsWith(key, prefix)) {
count++;
}
}
}
